C/C++のスレッド・ライブラリを用いて円周率の近似計算コードを書いてみた。スレッド用関数作ったり各スレッドにタスクを割り当てるのがすごく面倒。
- PThread
- Windows Thread
- C++11 Thread
- Apple GCD (Grand Central Dispatch)
##非並列
// PI近似計算
double compute_pi(const int num_of_partitions)
{
if (num_of_partitions < 1) return -1.;
const double width = 1. / (double)num_of_partitions;
double height = 0.;
double middle = 0.;
double sum = 0.;
double pi = 0.;
int index = 0;
for (index = 0; index < num_of_partitions; index++) {
middle = (index + 0.5) * width;
height = 4. / (1. + middle * middle);
sum += height;
}
pi = width * sum;
return pi;
}
##PThread
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
// スレッド関数パラメータ
typedef struct _thread_func_param
{
double width;
int begin;
int end;
double* sum;
pthread_mutex_t* mutex;
} thread_func_param;
// スレッド関数
void* thread_func(void* arg)
{
thread_func_param* param = (thread_func_param*)arg;
int i = 0;
double height = 0.;
double middle = 0.;
double sum = 0.;
for (i = param->begin; i <= param->end; i++) {
middle = (i + 0.5) * param->width;
height = 4. / (1. + middle * middle);
sum += height;
}
pthread_mutex_lock(param->mutex);
*(param->sum) += sum;
pthread_mutex_unlock(param->mutex);
return NULL;
}
// PI近似計算 (Pthread)
double compute_pi_by_pthread(const int num_of_partitions)
{
if (num_of_partitions < 1) return -1.;
// スレッド数=プロセッサ数
int num_of_threads = (int)sysconf(_SC_NPROCESSORS_ONLN);
const double width = 1. / (double)num_of_partitions;
int index = 0;
thread_func_param* params = NULL;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_t* threads = NULL;
double sum = 0.;
double pi = 0.;
pthread_mutex_init(&mutex, NULL);
// タスク割当
params = (thread_func_param*)malloc(sizeof(thread_func_param) * num_of_threads);
for (index = 0; index < num_of_threads; index++) {
params[index].width = width;
params[index].sum = ∑
params[index].mutex = &mutex;
params[index].begin = (index == 0)? 0 : params[index - 1].end + 1;
params[index].end = (index == num_of_partitions - 1)
? num_of_partitions - 1
: (num_of_partitions / num_of_threads) * (index + 1);
}
// スレッド生成
threads = (pthread_t*)malloc(sizeof(pthread_t) * num_of_threads);
for (index = 0; index < num_of_threads; index++) {
pthread_create(&threads[index], NULL, thread_func, (void*)¶ms[index]);
}
// スレッド待機
for (index = 0; index < num_of_threads; index++) {
pthread_join(threads[index], NULL);
}
pthread_mutex_destroy(&mutex);
free(threads);
free(params);
pi = width * sum;
return pi;
}
##Windows Thread
#include <stdio.h>
#include <process.h>
#include <Windows.h>
// スレッド関数パラメータ
typedef struct _thread_func_param
{
double width;
int begin;
int end;
double* sum;
CRITICAL_SECTION* cs;
} thread_func_param;
// スレッド関数
unsigned __stdcall thread_func(void* arg)
{
thread_func_param* param = (thread_func_param*)arg;
int ii = 0;
double height = 0.;
double middle = 0.;
double sum = 0.;
for (ii = param->begin; ii <= param->end; ii++) {
middle = (ii + 0.5) * param->width;
height = 4. / (1. + middle * middle);
sum += height;
}
EnterCriticalSection(param->cs);
*(param->sum) += sum;
LeaveCriticalSection(param->cs);
return 0;
}
// PI近似計算 (Windows Thread)
double compute_pi_by_winthread(const int num_of_partitions)
{
if (num_of_partitions < 1) return -1.;
// スレッド数=プロセッサ数
SYSTEM_INFO sysinfo;
GetSystemInfo(&sysinfo);
int num_of_threads = (int)sysinfo.dwNumberOfProcessors;
double width = 1. / (double)num_of_partitions;
int index = 0;
thread_func_param* params = NULL;
CRITICAL_SECTION cs;
HANDLE* threads = NULL;
double sum = 0.;
double pi = 0.;
InitializeCriticalSection(&cs);
// タスク割当
params = (thread_func_param*)malloc(sizeof(thread_func_param)* num_of_threads);
for (index = 0; index < num_of_threads; index++) {
params[index].width = width;
params[index].sum = ∑
params[index].cs = &cs;
params[index].begin = (index == 0)? 0 : params[index - 1].end + 1;
params[index].end = (index == num_of_partitions - 1)
? num_of_partitions - 1
: (num_of_partitions / num_of_threads) * (index + 1);
}
// スレッド生成
threads = (HANDLE*)malloc(sizeof(HANDLE)* num_of_threads);
for (index = 0; index < num_of_threads; index++) {
threads[index] = (HANDLE)_beginthreadex(NULL, 0, thread_func, ¶ms[index], 0, NULL);
}
// スレッド待機
for (index = 0; index < num_of_threads; index++) {
HANDLE thread = threads[index];
WaitForSingleObject(thread, INFINITE);
CloseHandle(thread);
}
DeleteCriticalSection(&cs);
free(threads);
free(params);
pi = width * sum;
return pi;
}
##C++11 Thread
ラムダ式でスレッド関数が不要な分、少しだけすっきり。
#include <thread>
#include <mutex>
#include <vector>
// PI近似計算 (C++11 std::thread)
double compute_pi_by_cpp11_thread(const int num_of_partitions)
{
if (num_of_partitions < 1) return -1.;
typedef std::shared_ptr<std::thread> type_thread;
typedef std::vector<type_thread> type_threads;
const double width = 1. / static_cast<double>(num_of_partitions);
const unsigned int num_of_threads = std::thread::hardware_concurrency();
// スレッド生成
std::mutex mutex;
double sum = 0.;
int begin = 0;
int end = 0;
type_threads threads;
for (unsigned int index = 0; index < num_of_threads; index++) {
// タスク割当
begin = (index == 0)? 0 : end + 1;
end = (index == num_of_partitions - 1)
? num_of_partitions - 1
: (num_of_partitions / num_of_threads) * (index + 1);
// スレッド生成
threads.push_back(type_thread(new std::thread([&sum, &mutex, width, begin, end](){
double partial_sum = 0.;
for (int i = begin; i <= end; i++) {
double middle = (i + 0.5) * width;
double height = 4. / (1. + middle * middle);
partial_sum += height;
}
mutex.lock();
sum += partial_sum;
mutex.unlock();
})));
}
// スレッド待機
std::for_each(threads.begin(), threads.end(), [](const type_thread& thread){
thread->join();
});
double pi = width * sum;
return pi;
}
##Apple GCD
これもC言語の機能拡張Blocksのおかげでスレッド関数を書かなくてよい。
#include <dispatch/dispatch.h>
#include <stdlib.h>
#include <unistd.h>
// PI近似計算 (Apple GCD)
double compute_pi_by_gcd(const int num_of_partitions)
{
if (num_of_partitions < 1) return -1.;
// スレッド数=プロセッサ数
size_t num_of_threads = (size_t)sysconf(_SC_NPROCESSORS_ONLN);
const double width = 1. / (double)num_of_partitions;
size_t index = 0;
size_t* begin = NULL;
size_t* end = NULL;
__block double sum = 0.;
double pi = 0.;
dispatch_semaphore_t semaphore = dispatch_semaphore_create(1);
// タスク割当
begin = (size_t*)malloc(sizeof(size_t) * num_of_threads);
end = (size_t*)malloc(sizeof(size_t) * num_of_threads);
for (index = 0; index < num_of_threads; index++) {
begin[index] = (index == 0)? 0 : end[index - 1] + 1;
end[index] = (index == num_of_partitions - 1)
? num_of_partitions - 1
: (num_of_partitions / num_of_threads) * (index + 1);
}
// スレッド開始
dispatch_apply(num_of_threads,
dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
^(size_t index){
size_t i = 0;
double height = 0.;
double middle = 0.;
double partional_sum = 0.;
for (i= begin[index]; i <= end[index]; i++) {
middle = (i + 0.5) * width;
height = 4. / (1. + middle * middle);
partional_sum += height;
}
dispatch_semaphore_wait(semaphore, DISPATCH_TIME_FOREVER);
sum += partional_sum;
dispatch_semaphore_signal(semaphore);
});
dispatch_release(semaphore);
free(begin);
free(end);
pi = width * sum;
return pi;
}