This tutorial shows how to run multiple instances of a conjugate gradient solver, one instance per GPU.
We start by including the necessary headers:
#ifndef VIENNACL_WITH_OPENCL
  #define VIENNACL_WITH_OPENCL
#endif
#include <iostream>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include <boost/numeric/ublas/operation_sparse.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/lu.hpp>
#define VIENNACL_WITH_UBLAS 1
using namespace boost::numeric;
 This tutorial uses Boost.Thread for threading. Other threading approaches (e.g. pthreads) also work. 
#include <boost/thread.hpp>
 This functor represents the work carried out in each thread. It creates the necessary objects, loads the data, and executes the CG solver. 
// Worker functor: one instance per thread, each bound to its own OpenCL
// context/GPU (selected via thread_id_). It loads the linear system, runs
// the CG solver, and records a human-readable summary in message_.
// NOTE(review): this listing is garbled by the documentation extraction —
// the operator()() signature, the `if` conditions guarding the file reads,
// the ViennaCL object declarations (vcl_result, vcl_ref_result, ...) and
// the solver call are missing. Compare with the "Full Example Code" below.
template<typename NumericT>
class worker
{
public:
  // Remember which thread (and hence which OpenCL context) this worker uses.
  worker(std::size_t tid) : thread_id_(tid) {}
 The functor interface, entry point for each thread. 
 Set up some ublas objects 
ublas::vector<NumericT> rhs;                      // right-hand side b
ublas::vector<NumericT> ref_result;               // reference solution used for checking
ublas::compressed_matrix<NumericT> ublas_matrix;  // sparse system matrix A
 Read system from file. You may also assemble everything on the fly here. 
// NOTE(review): the `if (<read failed>)` conditions were lost in extraction;
// as printed these are bare blocks that would return unconditionally.
{
  std::cout << "Error reading Matrix file" << std::endl;
  return;
}
{
  std::cout << "Error reading RHS file" << std::endl;
  return;
}
{
  std::cout << "Error reading Result file" << std::endl;
  return;
}
  Set up some ViennaCL objects in the respective context. It is important to place the objects in the correct context (associated with each thread) 
std::size_t vcl_size = rhs.size();
// Copy the reference result to the device vector living in this thread's context.
// NOTE(review): vcl_ref_result's declaration is not visible in this listing.
viennacl::copy(ref_result.begin(), ref_result.end(), vcl_ref_result.begin());
 
  Transfer ublas-matrix to ViennaCL objects sitting on the GPU: 
    // Compose the per-thread report: thread id, OpenCL device name,
    // first entry of the computed result vs. the reference result.
    std::stringstream ss;
    ss << 
"Result of thread " << thread_id_ << 
" on device " << 
viennacl::ocl::get_context(static_cast<long>(thread_id_)).
devices()[0].name() << 
": " << vcl_result[0] << 
", should: " << ref_result[0] << std::endl;
    message_ = ss.str();
  }
  // Retrieve the summary assembled by the functor body (valid after join()).
  std::string message() const { return message_; }
private:
  std::string message_;     // result summary, written by the worker body
  std::size_t thread_id_;   // index of the OpenCL context this worker runs in
};
  In the main routine we create two OpenCL contexts and then use one thread per context to run the CG solver in the functor defined above. 
// NOTE(review): the `int main()` signature and the OpenCL platform query
// (the `pf` object, ScalarType typedef, and the viennacl::ocl::setup_context
// calls) were stripped by the extraction; see "Full Example Code" below.
{
  
  {
    std::cerr << "Error: No platform found!" << std::endl;
    return EXIT_FAILURE;
  }
  Part 1: Setup first device for first context, second device for second context: 
std::vector<viennacl::ocl::device> 
const & devices = pf.
devices();
// NOTE(review): the bodies of this if/else (context setup with one vs. two
// devices) are missing from the listing.
if (devices.size() > 1)
else
  Part 2: Now let two threads operate on two GPUs in parallel, each running a CG solver 
// One worker per context; boost::ref keeps the functors by reference so the
// messages they record can be read back after join().
worker<ScalarType> work_functor0(0);
worker<ScalarType> work_functor1(1);
boost::thread worker_thread_0(boost::ref(work_functor0));
boost::thread worker_thread_1(boost::ref(work_functor1));
// Wait for both solvers to finish before reading their reports.
worker_thread_0.join();
worker_thread_1.join();
std::cout << work_functor0.message() << std::endl;
std::cout << work_functor1.message() << std::endl;
 That's it. Print a success message and exit. 
  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
  return EXIT_SUCCESS;
}
Full Example Code
#ifndef VIENNACL_WITH_OPENCL
  #define VIENNACL_WITH_OPENCL
#endif
#include <iostream>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/triangular.hpp>
#include <boost/numeric/ublas/matrix_sparse.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/operation.hpp>
#include <boost/numeric/ublas/operation_sparse.hpp>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/lu.hpp>
#define VIENNACL_WITH_UBLAS 1
using namespace boost::numeric;
#include <boost/thread.hpp>
// Worker functor: one instance per thread, each bound to its own OpenCL
// context/GPU (selected via thread_id_). It loads the linear system, runs
// the CG solver, and records a human-readable summary in message_.
// NOTE(review): parts of the original example were lost in the document
// extraction — the file-reading `if` conditions, the ViennaCL object
// declarations (vcl_result, vcl_rhs, vcl_matrix, ...), the copy of the
// system to the device, and the viennacl::linalg::solve(...) call are
// missing, so this listing does not compile as printed.
template<typename NumericT>
class worker
{
public:
  // Remember which thread (and hence which OpenCL context) this worker uses.
  worker(std::size_t tid) : thread_id_(tid) {}
  // Thread entry point: load data, solve, and record the result message.
  void operator()()
  {
    ublas::vector<NumericT> rhs;                      // right-hand side b
    ublas::vector<NumericT> ref_result;               // reference solution used for checking
    ublas::compressed_matrix<NumericT> ublas_matrix;  // sparse system matrix A
    // NOTE(review): the `if (<read failed>)` conditions were lost in
    // extraction; as printed these bare blocks return unconditionally.
    {
      std::cout << "Error reading Matrix file" << std::endl;
      return;
    }
    {
      std::cout << "Error reading RHS file" << std::endl;
      return;
    }
    {
      std::cout << "Error reading Result file" << std::endl;
      return;
    }
    std::size_t vcl_size = rhs.size();
    // Compose the per-thread report: thread id, OpenCL device name,
    // first entry of the computed result vs. the reference result.
    // NOTE(review): vcl_result is not declared in this listing — its
    // declaration and the solver call producing it were stripped.
    std::stringstream ss;
    ss << 
"Result of thread " << thread_id_ << 
" on device " << 
viennacl::ocl::get_context(static_cast<long>(thread_id_)).
devices()[0].name() << 
": " << vcl_result[0] << 
", should: " << ref_result[0] << std::endl;
    message_ = ss.str();
  }
  // Retrieve the summary assembled by operator()() (valid after join()).
  std::string message() const { return message_; }
private:
  std::string message_;     // result summary, written by operator()()
  std::size_t thread_id_;   // index of the OpenCL context this worker runs in
};
// Main routine: create one OpenCL context per device, then run one CG
// solver per context on its own thread.
// NOTE(review): the `int main()` signature, the ScalarType typedef, the
// platform query creating `pf`, and the viennacl::ocl::setup_context(...)
// calls were stripped by the extraction; this listing does not compile as
// printed.
{
  
  {
    std::cerr << "Error: No platform found!" << std::endl;
    return EXIT_FAILURE;
  }
  std::vector<viennacl::ocl::device> 
const & devices = pf.
devices();
  
  
  // NOTE(review): the bodies of this if/else (setting up context 0 and
  // context 1 with the first/second device) are missing from the listing.
  if (devices.size() > 1)
  else
  // One worker per context; boost::ref keeps the functors by reference so
  // the messages they record can be read back after join().
  worker<ScalarType> work_functor0(0);
  worker<ScalarType> work_functor1(1);
  boost::thread worker_thread_0(boost::ref(work_functor0));
  boost::thread worker_thread_1(boost::ref(work_functor1));
  // Wait for both solvers to finish before reading their reports.
  worker_thread_0.join();
  worker_thread_1.join();
  std::cout << work_functor0.message() << std::endl;
  std::cout << work_functor1.message() << std::endl;
  std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl;
  return EXIT_SUCCESS;
}