Implementaci�n en OpenCL

1 Licencia de los Trabajos

En el siguiente se expresa la licencia de cada trabajo.

1.1 clhelpers

/* 
   Copyright 2019 Christian Gimenez

   Author: Christian Gimenez

   clhelpers.hpp

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/* 
   Copyright 2019 Christian Gimenez

   Author: Christian Gimenez

   clhelpers.cpp

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

1.2 Convoluci�n 1D

/* 

   Copyright 2019 Christian Gimenez

   Author: Christian Gimenez

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/* 

   Copyright 2019 Christian Gimenez

   Author: Christian Gimenez

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#! /usr/bin/fish

# Copyright 2019 Christian Gimenez

# Author: Christian Gimenez

# compile.fish

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

2 Biblioteca cpu_timer.h

Se utilizar� la biblioteca cpu_timer.h prove�da en el curso.

#pragma once
#include <ctime>

struct timespec diff(timespec start, timespec end)
{
        timespec temp;
        if ((end.tv_nsec-start.tv_nsec)<0) {
                temp.tv_sec = end.tv_sec-start.tv_sec-1;
                temp.tv_nsec = 1000000000+end.tv_nsec-start.tv_nsec;
        } else {
                temp.tv_sec = end.tv_sec-start.tv_sec;
                temp.tv_nsec = end.tv_nsec-start.tv_nsec;
        }
        return temp;
}

struct cpu_timer{
        struct timespec time1, time2;
        double ms_elapsed;

        cpu_timer(){
                tic();
        }
       ~cpu_timer(){}

        void tic(){
                clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time1);
        }
        double tac(){
                clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time2);
                return(ms_elapsed=elapsed());
        }
        double elapsed(){
            return (double)diff(time1,time2).tv_sec*1000 + (double)diff(time1,time2).tv_nsec*0.000001;
        }
};

#define CRONOMETRAR_CPU( X,VECES ) {  { \
                            cpu_timer t; \
                            float msacum=0.0;\
                            float msacum2=0.0;\
                            for(int n=0;n<VECES;n++){\
                                t.tic();\
                                X; t.tac();\
                                msacum+=t.ms_elapsed;\
                                msacum2+=(t.ms_elapsed*t.ms_elapsed);\
                            }\
                            std::cout << "CPU: " << (msacum/VECES) << " +- " << \
                            (sqrt(msacum2/VECES - msacum*msacum/VECES/VECES)) \
                            << " ms (" << VECES << " veces)\n"; \
                            }}

3 Biblioteca clhelpers

Hay una serie de tareas que se realizan de forma com�n entre los ejercicios. Estos se implementaron como funciones dentro de la biblioteca clhelpers. A continuaci�n se describen sus archivos.

3.1 clhelpers.hpp

Macros para evitar la inclusi�n de la librer�a m�s de una vez.

#ifndef _CLHELPERS_HPP
#define _CLHELPERS_HPP 1

Se utilizar� el tipo string de C++.

#include <string>

using namespace std;

Interpreta el n�mero de error de clCreateKernel.

/**
 Show in stdout a string that represent the Create Kernel Error.

 @param errnum cl_int A CL error number returned by clCreateKernel.
 */
void report_create_kernel_err(cl_int errNum);

/**
 Print a particular device information value to stdout.

 Search for the value of a given field and print it to stdout.

 @param name The name of the CL field. 
 @param str A label to print before the value.
 */
void print_dev_info(cl_device_id dev_id,
                    cl_device_info name,
                    std::string str);

/**
 Print a particular platform information value to stdout.

 Search for the value of a given field and print it to stdout.

 @param str A label to print before de value.
 */
void print_platform_info(cl_platform_id platform_id,
                         cl_platform_info name,
                         std::string str);

/**
 Print all platform information.

 @param platformIDs An array of platforms ids.
 @param numPlatforms The amount of platforms in the array.
 */
void print_platforms(cl_platform_id* platformIDs, cl_uint numPlatforms);

/**
 Print the context information.

 Print the platform selected and the devices used for the context.
 */
void print_context_info(cl_platform_id id,
                        int numDevices,
                        cl_device_id *deviceIDs);

/**
 Return all the platforms identified by OpenCL.

 @param platform_ids Output. An array with platforms IDs.
 @param num_platforms Output. The amount of platforms founded.
 */
void all_platforms(cl_platform_id **platform_ids, cl_uint *num_platforms);

/**
 Search for the platform name and return its ID.

 @param name The platform name to search.
 @param platformIDs The IDs of the platform to search within.
 @param numPlatforms The amount of platforms IDs.
 @param selected_id Output. The platform ID with the same name as the given.
 @return true if founded, false otherwise.
 */
bool select_platform(const char* name,
                     cl_platform_id* platformIDs,
                     cl_uint numPlatforms,
                     cl_platform_id *selected_id);

void select_devices(cl_platform_id selected_platform,
                    cl_device_type type,
                    cl_device_id **device_ids,
                    cl_uint* num_devices,
                    cl_uint* platformI);

void create_context(cl_platform_id platform_id,
                    cl_device_id *deviceIDs,
                    cl_uint numDevices,
                    cl_context *context);

void load_code(const char* filename, string* src);

void compile_code(cl_context context,
                  cl_uint numDevices,
                  cl_device_id *deviceIDs,
                  string src,
                  cl_program *program);

void create_kernel(cl_program program,
                   const char* kernel_name,
                   cl_kernel *kernel);

Macro para indicar el fin del if.

#endif /* _CLHELPERS_HPP */

3.2 clhelpers.cpp

Bibliotecas que se van a utilizar.

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#include <cstdlib>
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <cstring>

Vamos a utilizar una funci�n de callback para cuando surga un error. Se solicita al ejecutar clCreateContext. Lo que hace esta funci�n es reportar el error y salir del programa.

En caso de que la macro CL_CALLBACK no est� definida, se la define para evitar errores de precompilaci�n.

#if !defined(CL_CALLBACK)
#define CL_CALLBACK
#endif

//#include "clhelpers.hpp"

void CL_CALLBACK contextCallback(const char *errInfo, const void *private_inde,
                                 size_t cb, void *user_data){
  std::cerr << "Error occured during context use: " << errInfo << std::endl;
  exit(1);
}

Funci�n para mapear el error de clCreateKernel a un string y mostrarlo en pantalla.

void report_create_kernel_err(cl_int errNum){
  std::cout << "Create Kernel Error (" << errNum << "): " << std::endl;
  switch (errNum){
  case CL_INVALID_PROGRAM:
    std::cout << "Invalid program" << std::endl;
    break;
  case CL_INVALID_PROGRAM_EXECUTABLE:
    std::cout << "Invalid program executable" << std::endl;
    break;
  case CL_INVALID_KERNEL_NAME:
    std::cout << "Invalid kernel name" << std::endl;
    break;
  case CL_INVALID_KERNEL_DEFINITION:
    std::cout << "Invalid kernel definition" << std::endl;
    break;
  case CL_INVALID_VALUE:
    std::cout << "Invalid value" << std::endl;
    break;
  case CL_OUT_OF_HOST_MEMORY:
    std::cout << "Out of host memory" << std::endl;
    break;
  default:
    std::cout << "Unknown error" << std::endl;
  }
}

Funci�n para imprimir un campo de la informaci�n de un dispositivo. Se espera una etiqueta str para imprimir en pantalla previo al valor del campo.

Como convenci�n de OpenCL, el primer clGetDeviceInfo tiene tama�o cero y NULL en el espacio de salida para indicar que se desea saber el tama�o de memoria que se debe reservar para el valor. V�ase la documentaci�n de clGetDeviceInfo.

Tambi�n se incorpora un poco de parseo en el valor para mostrarlo de forma m�s legible.

void print_dev_info(cl_device_id dev_id,
                    cl_device_info name,
                    std::string str){
  size_t size;

  clGetDeviceInfo(dev_id, name, 0, NULL, &size);
  char *info = (char*) alloca( sizeof(char) * size);
  clGetDeviceInfo(dev_id, name, size, info, NULL);

  std::cout << str << ": ";
  if (name == CL_DEVICE_TYPE){
    cl_device_type t = *(reinterpret_cast<cl_device_type*>(info));

    switch (t){
    case CL_DEVICE_TYPE_CPU:
      std::cout << "CPU" << std::endl;
      break;
    case CL_DEVICE_TYPE_GPU:
      std::cout << "GPU" << std::endl;
      break;
    case CL_DEVICE_TYPE_ACCELERATOR:
      std::cout << "Accelerator" << std::endl;
      break;
    case CL_DEVICE_TYPE_DEFAULT:
      std::cout << "Default" << std::endl;
      break;
    }

  }else{
    std::cout << info << std::endl;
  }
}

Mismo que el anterior, pero aplicado a un campo de una plataforma.

void print_platform_info(cl_platform_id platform_id,
                         cl_platform_info name,
                         std::string str){
  cl_uint err;
  std::size_t size;

  err = clGetPlatformInfo(platform_id, name, 0, NULL, &size);
  if (err != CL_SUCCESS){
    std::cout << "Cannot retrieve " << str << std::endl;
    return;
  }
  char *info = (char*) alloca( sizeof(char) * size);
  err = clGetPlatformInfo(platform_id, name, size, info, NULL);
  if (err != CL_SUCCESS){
    std::cout << "Cannot retrieve " << str << std::endl;
    return;
  }

  std::cout << str << ": " << info << std::endl;
}

Imprimir el nombre y vendor de todas las plataformas dadas en platformsIDs.

void print_platforms(cl_platform_id* platformsIDs, cl_uint numPlatforms){
  std::cout << "Platforms" << std::endl;
  for (cl_uint i = 0; i < numPlatforms; i++){
    print_platform_info(platformsIDs[i], CL_PLATFORM_NAME, "Name");
    print_platform_info(platformsIDs[i], CL_PLATFORM_VENDOR, "Vendor");
    std::cout << std::endl;
  }
}

Aunque el objetivo de esta funci�n es imprimir los dispositivos y la plataforma asociada a un contexto, en realidad se est� pidiendo estos datos por par�metros. La idea es que antes de usar estos datos para crear el contexto, se los pase a esta funci�n para mostrarlos.

void print_context_info(cl_platform_id id,
                        int numDevices,
                        cl_device_id *deviceIDs){
  size_t size;

  std::cout << "Context:" << std::endl;
  std::cout << "Platform selected: " << id << std::endl;

  print_platform_info(id, CL_PLATFORM_NAME, "Name");
  print_platform_info(id, CL_PLATFORM_VENDOR, "Vendor");


  std::cout << "Devices selected: " << std::endl;
  for (int j = 0; j < numDevices; j++){
    std::cout << "ID:" <<  deviceIDs[j] << std::endl;
    print_dev_info(deviceIDs[j], CL_DEVICE_TYPE, "Type");
    print_dev_info(deviceIDs[j], CL_DEVICE_NAME, "Name");
    print_dev_info(deviceIDs[j], CL_DEVICE_VENDOR, "Vendor");
    print_dev_info(deviceIDs[j], CL_DEVICE_VERSION, "Dev. Version");
    print_dev_info(deviceIDs[j], CL_DRIVER_VERSION, "Driver Version");

  }
  std::cout << std::endl; 
}

Obtener el ID de todas las plataformas disponibles.

void all_platforms(cl_platform_id **platform_ids, cl_uint *num_platforms){
  cl_int errNum;
  cl_uint numPlatforms;
  cl_platform_id *platformIDs;

  errNum = clGetPlatformIDs(0, NULL, &numPlatforms);

  if (errNum != CL_SUCCESS) {
    printf("clGetPlatformIDs error");
    exit(1);
  }

  platformIDs = (cl_platform_id *) malloc
    (sizeof(cl_platform_id) * numPlatforms);

  errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL);
  if (errNum != CL_SUCCESS){
    printf("Couldn't get platforms IDs");
    exit(1);
  }

  *num_platforms = numPlatforms;
  *platform_ids = platformIDs;
}

Localizar la plataforma con el nombre dado y devolver su ID.

bool select_platform(const char* name,
                     cl_platform_id* platformIDs,
                     cl_uint numPlatforms,
                     cl_platform_id *selected_id){

  bool founded = false;

  for (cl_uint i = 0; i < numPlatforms; i++){
    std::size_t size;
    clGetPlatformInfo(platformIDs[i], CL_PLATFORM_NAME, 0, NULL, &size);

    char *info = (char*) alloca( sizeof(char) * size );
    clGetPlatformInfo(platformIDs[i], CL_PLATFORM_NAME, size, info, NULL);

    if (strcmp(info, name) == 0){
      *selected_id = platformIDs[i];
      founded = true;
    }
  }

  return founded;
}

Buscar los dispositivos que sean parte de la plataforma seleccionada y de un tipo dado.

/**
 Search for devices through all the platforms.
 */
void select_devices(cl_platform_id selected_platform,
                    cl_device_type type,
                    cl_device_id **device_ids,
                    cl_uint* num_devices,
                    cl_uint* platformI){
  cl_uint i, numDevices;
  cl_int errNum;
  cl_device_id *deviceIDs;

  // Search for the amount of devices in the platform.
  errNum = clGetDeviceIDs(selected_platform, type, 0, NULL, &numDevices);
  *num_devices = numDevices;

  if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){
    std::cerr << "Device not found or error";
    exit(1);
  }else if (numDevices > 0){

    // Get all device IDs from that platform.
    deviceIDs = (cl_device_id *) malloc(sizeof(cl_device_id) * numDevices);
    *device_ids = deviceIDs;

    errNum = clGetDeviceIDs(selected_platform, type,
                            numDevices, &deviceIDs[0], NULL);
    *platformI = i;
  }  
}

Crear un contexto OpenCL asignando la plataforma y los dispositivos dados.

void create_context(cl_platform_id platform_id,
                    cl_device_id *deviceIDs,
                    cl_uint numDevices,
                    cl_context *context){
  cl_int errNum;
  cl_context_properties contextProps[] =
    {
     // Specifies the platform to use in the next element
     // Which platform to use
     CL_CONTEXT_PLATFORM, (cl_context_properties) platform_id,
     0  // 0 ends the context properties array (like C strings)
    };

  *context = clCreateContext(contextProps,
                             // devices attached to this context
                             numDevices, deviceIDs,
                             // Callbacks and error reporting
                             &contextCallback, NULL, &errNum);
  if (errNum != CL_SUCCESS){
    std::cerr << "clCreateContext Error" << std::endl;
    exit(1);
  }
}

Cargar un c�digo OpenCL contenido en un archivo. Devolver el string en src.

void load_code(const char* filename, std::string *src){
  std::ifstream srcFile(filename);

  if (!srcFile.is_open()){
    std::cerr << "Error: convolucion.cl not opened" << std::endl;
    exit(1);
  }

  std::string srcProg(std::istreambuf_iterator<char>(srcFile),
                      (std::istreambuf_iterator<char>()) );

  *src = srcProg;
}

Compilar el c�digo OpenCL. Devolver el programa compilado en program. En caso de errores, reportarlos en la salida de error y salir del programa.

void compile_code(cl_context context,
                  cl_uint numDevices,
                  cl_device_id *deviceIDs,
                  std::string src,
                  cl_program *program){
  cl_int errNum;

  const char* csrc = src.c_str();
  size_t length = src.length();

  *program = clCreateProgramWithSource(context, 1, &csrc, &length, &errNum);

  if (errNum != CL_SUCCESS){
    std::cerr << "CreateProgram error:" << std::endl;
    switch (errNum){
    case CL_INVALID_CONTEXT:
      std::cerr << "Invalid Context" << std::endl;
      break;
    case CL_INVALID_VALUE:
      std::cerr << "Invalid Value" << std::endl;
      std::cerr << "Length: " << length << std::endl;
      std::cerr << "Source: " << std::endl;      
      std::cerr << csrc << std::endl;
      break;
    case CL_OUT_OF_HOST_MEMORY:
      std::cerr << "Out of host memory" << std::endl;
      break;
    default:
      std::cerr << "Unknown error" << std::endl;
    }
    exit(1);
  }

  errNum = clBuildProgram(*program, numDevices, deviceIDs, NULL, NULL, NULL);

  if (errNum != CL_SUCCESS){
    char buildLog[16384];
    clGetProgramBuildInfo(*program, deviceIDs[0], CL_PROGRAM_BUILD_LOG,
                          sizeof(buildLog), buildLog, NULL);
    std::cerr << "Compile error in kernel: " << std::endl;
    std::cerr << buildLog;
    exit(1);
  }
}

Crear un kernel con el programa dado.

void create_kernel(cl_program program,
                   const char* kernel_name,
                   cl_kernel *kernel){
  cl_int errNum;
  *kernel = clCreateKernel(program, kernel_name, &errNum);

  if (errNum != CL_SUCCESS){
    report_create_kernel_err(errNum);
    exit(1);
  }
}

4 Convoluci�n 1D

Se comienza por el programa principal. Se importan las librer�as a utilizar.

#include <cassert>
#include <iostream>
#include <string>
#include <ctime>

#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif

#include "cpu_timer.h"

#include "clhelpers.hpp"

La siguiente funci�n es para escribir en que etapa se est� ejecutando el programa a la salida est�ndar. Se evita un poco de c�digo y se d� un formato que el usuario puede reconocer.

void message(const char* str){
  std::cout << "-> " << str << std::endl;
}

Funciones para inicializar el arreglo inicial con valores aleatorios y el filtro con valores 1.

void init_input(float* input, size_t size){
  for (int i = 0; i < size; i++){
    input[i] = (float) (rand() % 100);
  }
}

void setup_filter(float* filter, size_t size){
  for(int i = 0; i < size; i++) {
    // filter[i] = (float) (rand() % 100);
    filter[i] = 1;
  }
}

El algoritmo de convoluci�n de CPU. �ste es similar al de CUDA.

/* Convolucion en la cpu */
void convolucion_cpu(const float* input, float* output, const float* filter,
                     const int n, const int m) 
{
  /*
   Ayuda: se implementa la convolucion secuencial. 
   Tenga en cuenta que esta es una posible solución muy simple al 
   problema.
  */
  float temp;

  /*
   Barrido del vector input (tamaño N) y para cada elemento j hasta N 
   hago la operacion de convolucion: elemento i del vector filter por
   elemento i+j del vector input.
  */
  for(int j = 0; j < n; j++){   
    temp = 0.0;
    for(int i = 0; i < m; i++){
      temp += filter[i]*input[i+j];
    }
    output[j] = temp;
  }

}

El programa principal debe parsear los primeros argumentos. Se aceptan dos:

N: El tama�o del arreglo.
M: El tama�o del filtro.

int main(int argc, char** argv){

  /* --- Parse parameters --- */
  if (argc < 2){
    printf("Synopsis: ./convolucion_float array_size filter_size");
    exit(1);
  }

  const unsigned int n = atoi(argv[1]); // array size
  const unsigned int m = atoi(argv[2]); // filter size
  const unsigned int size_input = sizeof(cl_float) * (n + m);
  const unsigned int size_filter = sizeof(cl_float) * m;
  const unsigned int size_output = sizeof(cl_float) * n;

Las dimensiones del filtro deben ser menores a 1024 n�meros (cantidad de threads de CUDA, heredado puesto que en las plataformas que tengo el m�ximo de workgroup tambi�n es de 1024). Adem�s, N debe ser m�ltiplo de M.

// chequeo que las dimensiones N y M sean correctas para esta solucion
assert((n % m == 0) && (m < 1024));

Se define una variable errNum para guardar los errores si sucediera en alguna funci�n OpenCL.

cl_int errNum;

4.1 Armar Contexto

Primero se pide todos los IDs de las plataformas. Imprimir en pantalla por si el usuario quiere verlas (usar el comando clinfo para ver m�s detalles).

Se utilizar�n las siguientes variables:

platformIDs: Un arreglo con los IDs de las plataformas.
selected_platform: Contendr� el ID de la plataforma que queremos usar.
numPlatforms: Cantidad de plataformas encontradas (tama�o del arreglo platformIDs).

// Select OpenCL platforms to run on 
cl_platform_id* platformIDs = NULL;
cl_platform_id selected_platform;
cl_uint numPlatforms;
all_platforms(&platformIDs, &numPlatforms);

print_platforms(platformIDs, numPlatforms);
message("Platforms detected");

Luego, se busca la plataforma deseada. En este caso, ``Clover'' es el nombre de la plataforma de MESA. Es posible instalar otras implementaciones de OpenCL y tambi�n pueden utilizarse.

Si la plataforma no existe (errb resulta falso), debe detenerse la ejecuci�n.

// Clover == MESA
std::string platform_name = "Clover";

bool errb = select_platform(platform_name.c_str(), platformIDs, numPlatforms,
                            &selected_platform);

assert(errb);

Solicitamos los dispositivos que la plataforma soporta. Los usaremos todos en el contexto. Podemos pedir CPU y GPU, aunque en este caso pediremos s�lo GPU.

deviceIDs: Un arreglo con los IDs de los dispositivos.
numDevices: La cantidad de dispositivos encontrados (tama�o de deviceIDs).
platform_index: El �ndice de la plataforma seleccionada (platformIDs[platform_index] == selected_platform debe ser verdadero).

// Iterate through the list of platforms until we find one that supports
// a GPU device, otherwise fail with an error.
cl_device_id *deviceIDs = NULL;
cl_uint numDevices;
cl_uint platform_index;
select_devices(selected_platform,
               CL_DEVICE_TYPE_GPU,
               &deviceIDs, &numDevices, &platform_index);

print_context_info(platformIDs[platform_index], numDevices, deviceIDs);

message("Devices detected");

Se crea el contexto con la plataforma seleccionada y los dispositivos encontrados.

// Create OpenCL context for creating buffers
// We need the device and the platform for creating a new context.

// No special properties is going to be used.
cl_context context;
create_context(selected_platform,
               deviceIDs,
               numDevices,
               &context);

message("Context created");

4.2 Cargar C�digo OpenCL

Luego, se carga el c�digo OpenCL del archivo convolucion.cl a memoria.

// Load OpenCL code
string src;
size_t length;
load_code("convolucion.cl", &src);

message("CL Code loaded");

Una vez cargado el programa en src, se debe compilar. La siguiente instrucci�n lo compila y captura cualquier error en el c�digo mostr�ndolo en pantalla.

// Compile OpenCL code
cl_program program;
compile_code(context, numDevices, deviceIDs, src,
             &program);

message("CL Code compiled");

4.3 Crear y ejecutar el kernel

El c�digo compilado debe contener una funci�n kernel, el cual debe ser cargado a memoria. Tambi�n deben setearse los par�metros a utilizar.

Primero, crear el objeto kernel para la funci�n "convolucion_gpu".

// Create kernel object
cl_kernel kernel;
std::string kernel_name = "convolucion_gpu";
create_kernel(program, kernel_name.c_str(), &kernel);

message("Kernel created");

Para poder setear sus par�metros, se debe inicializar el filtro y el arreglo de entrada. Antes que nada, se crea los espacios de memorias que se usar�n en el host.

/* --- Init host data --- */
float *h_input, *h_output, *check_output, *h_filter;
h_input      = (float *) malloc(size_input);
h_output     = (float *) malloc(size_output);
check_output = (float *) malloc(size_output);
h_filter     = (float *) malloc(size_filter);

assert(h_input);
assert(h_output);
assert(check_output);
assert(h_filter);

Se inicializa el filtro y la entrada.

setup_filter(h_filter, m);
init_input(h_input, n+m);

message("filter and input initialized");

En CPU, se ejecuta la convoluci�n para comparar su resultado con el de la salida del paralelo. Aqu� se usa el objeto cpu_timer para tomar el tiempo que tarda y comparar con el del GPU.

// timespec cpustart, cpuend;
cpu_timer cpu_prof;
cpu_prof.tic();

// clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpustart);
convolucion_cpu(h_input, check_output, h_filter, n, m);
// clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cpuend);

cpu_prof.tac();

std::cout << "CPU Profiling: " << cpu_prof.elapsed() << " msecs" << std::endl;

Para enviar los datos a la GPU, es necesario reservar la memoria indicando la cantidad. Esto se debe hacer para el filtro, la entrada y la salida. OpenCL ofrece la posibilidad de reservar el espacio y copiar datos o inicializarlos con un valor al mismo tiempo.

Aqu� se crea lo siguiente:

d_input: Un espacio de s�lo lectura del tama�o de la entrada y que lo inicializar� con los datos de h_input.
d_output: Un espacio de s�lo escritura del tama�o de la salida. Se inicializar� en NULL.
d_filter: En espacio de s�lo lectura inicializado con los datos de h_filter.

/* --- Allocate buffers --- */
cl_mem d_input, d_output, d_filter;
d_input = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                         size_input, static_cast<void *>(h_input),
                         &errNum);
d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                          size_output, NULL,
                          &errNum);
d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                          size_filter, static_cast<void *>(h_filter),
                          &errNum);

Ahora s�, se posee todo lo necesario para setear los argumentos al kernel. Se espera que la funci�n convolucion_gpu tenga cinco par�metros:

El arreglo de entrada.
El arreglo de salida.
El filtro.
Tama�o del arreglo de entrada.
Tama�o del filtro.

clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_input);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_output);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_filter);  
clSetKernelArg(kernel, 3, sizeof(cl_uint), &n);
clSetKernelArg(kernel, 4, sizeof(cl_uint), &m);

4.4 Crear la Cola de Comandos

OpenCL ofrece la posibilidad de ejecutar varios kernels de forma ordenada (o desordenada si se lo configura para ello).

El siguiente snippet crea una cola de comandos dentro del contexto. Se activa el profiling para tomar los tiempos de GPU.

cl_command_queue queue = clCreateCommandQueue(context, deviceIDs[0],
                                              // 0, // No Profile
                                              CL_QUEUE_PROFILING_ENABLE,
                                              &errNum);


message("Command queue created");

Observar que clCreateCommandQueue est� en deprecated para OpenCL 1.1 en adelante, por lo que al compilar surgir� una advertencia.

Para agregar al kernel a la cola, se debe determinar el global work size y el local work size. Es lo an�logo a grid size y block size en CUDA. La diferencia con CUDA es que en OpenCL se debe especificar el total de global works/.

Por ejemplo, en CUDA se especific� que el tama�o del bloque sea de M threads y la grilla de N/M bloques, dando un total de N threads. En cambio, en OpenCL se indica un global work size de N threads totales repartidos en local works de M threads, dejando N/M local works por global work.

// Queue the kernel for execution across the array
// Difference between CUDA and OpenCL: globalWorkSize is the total.
const size_t globalWorkSize[1] = { n };
const size_t localWorkSize[1] = { m };

std::cout << "global work size = " << globalWorkSize[0] << std::endl;
std::cout << "local work size = " << localWorkSize[0] << std::endl;

cl_event prof_event; // Profiling

clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                       globalWorkSize, localWorkSize,
                       0, NULL, &prof_event);

Para poder realizar el profiling de la funci�n kernel, se debe esperar a que la misma termine. Adem�s, hay que indicarle al evento que termine en el ciclo del reloj inmediato que pueda hacerlo.

clFinish(queue); // Profiling: Ensure the queue is finished
clWaitForEvents(1, &prof_event); // Profiling: wait for the event to tic

4.5 Mostrar Informaci�n del Profiling

Obtener la informaci�n del evento de profiling desde la GPU.

// Return profiling info
cl_ulong ev_start_time = (cl_ulong) 0;
cl_ulong ev_end_time = (cl_ulong) 0;
size_t ev_size;
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_QUEUED,
                        sizeof(cl_ulong), &ev_start_time, &ev_size);
clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END,
                        sizeof(cl_ulong), &ev_end_time, &ev_size);

Los tiempos retornados son en nanosegundos. Se calcula los milisegundos y segundos del tiempo entre que empez� y termin� de ejecutar el kernel. Se imprime todo en pantalla.

double run_time = (double) (ev_end_time - ev_start_time);
std::cout << "Kernel profiling: " << run_time * 1.0e-9 << " seconds";
std::cout << " = " << run_time * 1.0e-6 << " msecs";
std::cout << " = " << run_time << " nanosecs" << std::endl;

4.6 Obtener los Resultados desde la GPU

Copiar los resultados desde d_output, que est� en GPU, a h_output que est� en memoria del Host. Este comando se realizar� de forma bloqueante por lo que se ejecutar� posterior a terminar la cola de comandos.

errNum = clEnqueueReadBuffer(queue, // command_queue
                             d_output, // buffer
                             CL_TRUE, // blocking_read
                             0, // offset
                             size_output, // cb
                             h_output, // ptr
                             0, NULL, NULL);

if (errNum != CL_SUCCESS){
  std::cout << "Enqueue read buffer error" << std::endl;
  exit(1);
}

4.7 Comparar Resultados

Finalmente, se comparan los resultados dados por la CPU y la GPU. Una buena pr�ctica, no realizada aqu�, ser�a la utilizaci�n de un margen de error en caso de que las precisiones del punto flotante son distinta.

std::cout << "Comparison: --------------------" << std::endl;
std::cout << "Displaying differences" << std::endl;
std::cout << "[i] h_output - check_output" << std::endl;
for (int j=0; j < n; j++){
  if (h_output[j] != check_output[j]){
    std::cout << "[" << j << "] " <<
      h_output[j] << " - " << check_output[j] << std::endl;
    // assert(h_output[j] == check_output[j]);
  }
}
message("End comparison");

4.8 Liberar Espacios

Liberar lo reservado por malloc.

free(h_input);
free(h_output);
free(check_output);
free(h_filter);

Liberar los objetos de memoria de OpenCL.

clReleaseMemObject(d_input);
clReleaseMemObject(d_output);
clReleaseMemObject(d_filter);

Liberar el contexto. Si esto funciona significa que la cola de comandos y los objetos asignados a este contexto se han liberado apropiadamente.

errNum = clReleaseContext(context);
if (errNum != CL_SUCCESS){
  std::cerr << "Release context error" << std::endl;
}

Indicamos el fin del programa.

  message("Program ended.");
  return 0;
}

4.9 C�digo OpenCL

El c�digo OpenCL es el siguiente. Similar al de CUDA.

__kernel void convolucion_gpu
(
 __constant float const *input,
 __global   float *output,
 __constant float const *filter,
 const int n, // Input size
 const int m // Filter size
 ){
  int j = get_global_id(0);
  // int j = (blockIdx.x * blockDim.x) + threadIdx.x ;

  /*
   Barro vector input (tamaño N) y para cada elemento j hasta N hago la 
   operacion de convolucion: elemento i del vector filter por elemento 
   i+j del vector input.
  */
  if (j < n){
    output[j] = 0.0;
    for(int i = 0; i < m; i++){
      output[j] += filter[i] * input[i+j];
    }
  }
}

4.10 Script para Compilar

Para comipilar se debe ejecutar g++ indicando al linker que se usar� la biblioteca OpenCL.

g++ -o convolucion -lOpenCL main.cpp clhelpers.cpp