luaでもマルチスレッドやらせろ！！！

Last updated at 2022-12-27 / Posted at 2022-12-27

　
この記事はアドカレに参加しています。

luaでもマルチスレッドやらせろ！！！

先日、このような記事を書きました。luaでも外部ライブラリを入れればマルチスレッドができる、ということを確かめるための事前調査のようなものです。

さて、luaでもマルチスレッドできると分かったので、試してみました。

ところが、外部ライブラリさんがエラーばかり吐いて全く機能してくれない…。

じゃあ、違うものを、ということで別の外部ライブラリを試してみました。が、こちらも動かない…。

…というのを三回ぐらい繰り返したので、自分で実装することにしました。

実装方法

実装方法はいくつかあるみたいです。

今回はお試しということで、スレッドごとにlua環境を用意してやることにしました。ここで問題になるのが、変数の共有ですが、ffiでポインタを受け渡すようにしました。とっても理知的な反面、手間がかかります。~~この時点で没案行き~~

dllの実装

なんだかんだでマルチスレッドができるdllを作成することができました。
自作のコミュニティにコードとバイナリがありますので、よければ。

ところで、個人のメモ帳のことをコミュニティと偽るのは大丈夫なのだろうか。

性能実験

気になるのはその性能ですね。
マルチスレッドにすることで、どれだけ速度が良くなるのか。
以下のコードで比較してみました。
(以下のコードは大まかな計測を目的としているので、正確な結果は得られません。予めご了承ください。)

--[[

blur.anm
	Performance test for multithreading.

]]

--track0:size,1,50,10,1
--track1:type,1,2,1,1
-- NOTE(review): the two "--track" lines above look like host track-bar
-- declarations and are kept byte-identical. In this script obj.track0 is used
-- as the blur size and obj.track1 selects the code path: 1 runs the
-- single-threaded blur, any other value runs the multi-threaded one.

-- t1/t2 hold the timestamps taken before and after the blur; their difference
-- is printed with an "[ms]" label at the end of the script.
local t1,t2
local d=require("M_Blur_Module")
t1=d.time()-- start of the measured interval

if(obj.track1==1)then
---------------------------------------------
---------------------------------------------

-- LuaJIT FFI gives typed access to the raw pixel buffers.
local ffi=require("ffi")

-- Declare the pixel layout. The declaration goes through pcall, so an error
-- raised by ffi.cdef (presumably a repeated declaration on a later run) is
-- ignored instead of aborting the script.
pcall(ffi.cdef,[[
    typedef struct Pixel_ {
      uint8_t b,g,r,a;
    } Pixel;
  ]])

-- "work" is the second buffer, used below as the intermediate image between
-- the two blur passes. The argument-less call yields the object's own buffer
-- together with its width and height.
local work=obj.getpixeldata("work")
local data,w,h=obj.getpixeldata()

-- r views the intermediate buffer, c views the object's buffer.
local r=ffi.cast("Pixel*",work)
local c=ffi.cast("Pixel*",data)

--- Round a channel value to the nearest integer, clamped to 0..255.
-- Values below 0 give 0 and values above 255 give 255; everything in between
-- is rounded half up.
-- @param a number  unclamped channel value
-- @return number   integer-valued result in the range 0..255
local function f(a)
 if(a>255)then return 255 end
 if(a<0)then return 0 end
 return math.floor(a+0.5)
end

-- Gaussian kernel: the radius ha is the (floored) track value and the
-- standard deviation t0 is one third of the track value.
local t0=obj.track0/3
local ha=math.floor(obj.track0)
local ha2=ha*2+1 -- kernel width (taps)
local ta={} -- ta[1..ha2]: kernel weights
local total_ta=0

do
 -- Terms that do not depend on the tap index.
 local denom=2*t0*t0
 local norm=math.sqrt(2*math.pi)*t0
 for i=1,ha2 do
  local s=(i-1)-ha -- signed distance from the kernel centre
  local weight=math.exp(-(s*s)/denom)/norm
  ta[i]=weight
  total_ta=total_ta+weight
 end
end

-- Normalise so that the weights sum to 1.0.
for i=1,ha2 do
 ta[i]=ta[i]/total_ta
end

local qt={} -- sliding window of the pixels currently under the kernel
local af=ffi.new("Pixel",{0}) -- placeholder pixel used for samples outside the image

-- Horizontal pass: read from c, write the blurred rows to r.
for y=0,h-1 do
 local row=y*w

 -- Prime the window with the samples centred on x=0.
 for i=1,ha2 do
  local sx=i-1-ha
  if(sx<0 or sx>=w)then
   qt[i]=af
  else
   qt[i]=c[row+sx]
  end
 end

 for x=0,w-1 do
  local ix=row+x

  -- Weighted sum of the window, one accumulator per channel.
  local sum_r,sum_g,sum_b,sum_a=0,0,0,0
  for i=1,ha2 do
   local px,wt=qt[i],ta[i]
   sum_r=sum_r+px.r*wt
   sum_g=sum_g+px.g*wt
   sum_b=sum_b+px.b*wt
   sum_a=sum_a+px.a*wt
  end

  r[ix].r=f(sum_r)
  r[ix].g=f(sum_g)
  r[ix].b=f(sum_b)
  r[ix].a=f(sum_a)

  -- Slide the window one pixel to the right.
  for i=2,ha2 do
   qt[i-1]=qt[i]
  end

  local nx=x+ha+1 -- column entering the window for the next x
  if(nx<w)then
   qt[ha2]=c[row+nx]
  else
   qt[ha2]=af
  end
 end
end

-- Vertical pass: read the horizontally blurred image from r, write the final
-- result back to c.
for x=0,w-1 do

 -- Prime the window with the samples centred on y=0.
 for i=1,ha2 do
  local sy=i-1-ha
  if(sy<0 or sy>=h)then
   qt[i]=af
  else
   qt[i]=r[x+sy*w]
  end
 end

 local ix=x
 for y=0,h-1 do
  -- Weighted sum of the window, one accumulator per channel.
  local sum_r,sum_g,sum_b,sum_a=0,0,0,0
  for i=1,ha2 do
   local px,wt=qt[i],ta[i]
   sum_r=sum_r+px.r*wt
   sum_g=sum_g+px.g*wt
   sum_b=sum_b+px.b*wt
   sum_a=sum_a+px.a*wt
  end

  c[ix].r=f(sum_r)
  c[ix].g=f(sum_g)
  c[ix].b=f(sum_b)
  c[ix].a=f(sum_a)

  -- Slide the window one pixel down.
  for i=2,ha2 do
   qt[i-1]=qt[i]
  end

  if(y+ha+1<h)then
   qt[ha2]=r[ix+(ha+1)*w] -- row entering the window for the next y
  else
   qt[ha2]=af
  end
  ix=ix+w
 end
end

-- Hand the blurred buffer back to the host.
obj.putpixeldata(data)

---------------------------------------------
---------------------------------------------

else

---------------------------------------------
---------------------------------------------

-- Worker script run by every thread.
--
-- It reads these free names, which the thread runtime is expected to supply
-- (NOTE(review): inferred from how the script uses them - confirm against
-- M_threads_Module):
--   P  pointer to this thread's SETSCRIPTDATA
--   I  thread index (the last thread is I==N-1)
--   N  number of threads
-- PASS is prepended by the launcher below: 1 = horizontal pass over this
-- thread's stripe of rows, 2 = vertical pass over its stripe of columns.
--
-- The two passes must not overlap in time. The vertical pass reads every row
-- of the intermediate buffer and overwrites the image buffer that the
-- horizontal pass is still reading. The script therefore runs only one pass
-- per launch, and the launcher starts the second pass after the first has
-- completed on all threads.
local code=[==[

local ffi=require"ffi"

pcall(ffi.cdef,[[
    typedef struct Pixel_ {
      uint8_t b,g,r,a;
    } Pixel;

    typedef struct SETSCRIPTDATA_ {
	int th_num;
      void* data;
	void* work;
	int w;
	int h;
	int track;
    } SETSCRIPTDATA;
  ]])

local THDATA=ffi.cast("SETSCRIPTDATA*",P)

THDATA.th_num=I -- report the thread index back to the launcher
local c=ffi.cast("Pixel*",THDATA.data)
local r=ffi.cast("Pixel*",THDATA.work)
local w,h=THDATA.w,THDATA.h
local track0=THDATA.track

-- Round to the nearest integer, clamped to 0..255.
local function f(a)
 if(a<0)then return 0
 elseif(a>255)then return 255
 else
  return math.floor(a+0.5)
 end
end

-- Gaussian kernel: radius ha, standard deviation t0.
local t0=track0/3
local ha=math.floor(track0)
local ha2=ha*2+1
local ta={}
local total_ta=0
for i=1,ha2 do
 local s=(i-1)-ha
 ta[i]=math.exp(-(s*s)/(2*t0*t0))/(math.sqrt(2*math.pi)*t0)
 total_ta=total_ta+ta[i]
end

-- Normalise so that the weights sum to 1.0.
for i=1,ha2 do
 ta[i]=ta[i]/total_ta
end

local qt={} -- sliding window of the pixels under the kernel
local af=ffi.new("Pixel",{0}) -- placeholder for samples outside the image

if(PASS==1)then

 -- Rows handled by this thread; the last thread also takes the remainder.
 local start_h=math.floor(h/N)*I
 local finish_h=start_h+math.floor(h/N)
 if(I==N-1)then finish_h=finish_h+h%N end

 -- Horizontal pass: c -> r
 for y=start_h,finish_h-1 do
  local ix=y*w
  local s=-ha

  for i=1,ha2 do
   if(s>=0 and s<w)then
    qt[i]=c[ix+s]
   else
    qt[i]=af
   end
   s=s+1
  end

  for x=0,w-1 do
   local sr,sg,sb,sa=0,0,0,0

   for i=1,ha2 do
    local px,wt=qt[i],ta[i]
    sr=sr+px.r*wt
    sg=sg+px.g*wt
    sb=sb+px.b*wt
    sa=sa+px.a*wt
   end

   r[ix].r=f(sr)
   r[ix].g=f(sg)
   r[ix].b=f(sb)
   r[ix].a=f(sa)

   for i=1,ha2-1 do
    qt[i]=qt[i+1]
   end

   if(x+ha+1<w)then
    qt[ha2]=c[ix+ha+1]
   else qt[ha2]=af end
   ix=ix+1
  end
 end

else

 -- Columns handled by this thread; the last thread also takes the remainder.
 local start_w=math.floor(w/N)*I
 local finish_w=start_w+math.floor(w/N)
 if(I==N-1)then finish_w=finish_w+w%N end

 -- Vertical pass: r -> c
 for x=start_w,finish_w-1 do
  local ix=x
  local s=-ha

  for i=1,ha2 do
   if(s>=0 and s<h)then
    qt[i]=r[ix+s*w]
   else
    qt[i]=af
   end
   s=s+1
  end

  for y=0,h-1 do
   local sr,sg,sb,sa=0,0,0,0

   for i=1,ha2 do
    local px,wt=qt[i],ta[i]
    sr=sr+px.r*wt
    sg=sg+px.g*wt
    sb=sb+px.b*wt
    sa=sa+px.a*wt
   end

   c[ix].r=f(sr)
   c[ix].g=f(sg)
   c[ix].b=f(sb)
   c[ix].a=f(sa)

   for i=1,ha2-1 do
    qt[i]=qt[i+1]
   end

   if(y+ha+1<h)then
    qt[ha2]=r[ix+(ha+1)*w]
   else qt[ha2]=af end
   ix=ix+w
  end
 end

end

]==]

local ffi=require"ffi"
local d=require"M_threads_Module"

-- "work" is the intermediate buffer; the argument-less call returns the
-- object's own buffer together with its width and height.
local work,data,w,h=obj.getpixeldata"work",obj.getpixeldata()

-- Same layout as the declaration inside the worker script. The declaration
-- goes through pcall, so an error raised by ffi.cdef is ignored.
pcall(ffi.cdef,[[
    typedef struct SETSCRIPTDATA_ {
	int th_num;
      void* data;
	void* work;
	int w;
	int h;
	int track;
    } SETSCRIPTDATA;
  ]])

-- NOTE(review): d.s() presumably reports the number of usable threads - confirm.
local thread_n=math.max(1,d.s())
local a={} -- keeps the per-thread parameter blocks alive
local ap={} -- table handed to the thread launcher

for i=1,thread_n do
a[i]=ffi.new("SETSCRIPTDATA[1]")
a[i][0].data=data
a[i][0].work=work
a[i][0].w=w
a[i][0].h=h
a[i][0].track=obj.track0
ap[i]=a[i]
end

-- Run the two passes as two separate launches so that the return of the first
-- d.c call acts as a barrier between them. The original script already assumed
-- that d.c returns only after every worker has finished, because it called
-- obj.putpixeldata immediately afterwards.
-- NOTE(review): confirm that d.c is blocking.
-- The prefix has no newline because the script text starts with one, so line
-- numbers inside the worker script are unchanged.
local er_h=d.c(thread_n,"local PASS=1 "..code,ap)
local er=d.c(thread_n,"local PASS=2 "..code,ap)

for i=1,thread_n do
local num=tonumber(a[i][0].th_num)
--debug_print(string.format("thread_num:%d,error:%d",num,er[num+1]))
end

obj.putpixeldata(data)

---------------------------------------------
---------------------------------------------
end


-- Stop the timer; TIME is the elapsed interval that gets reported as [ms].
t2=d.time()
local TIME=t2-t1

-- Only the image dimensions are needed for the report line.
local _,w,h=obj.getpixeldata()

local report=string.format("%f[ms],w=%5f,h=%5f",TIME,w,h)
debug_print(report)
obj.draw()

結果

処理時間、画像の横幅、画像の縦幅の順に結果を出力しています。

まずはマルチスレッドなしver.

[23:56:54]      10.666900[ms],w=192.000000,h=108.000000
[23:56:58]      23.735600[ms],w=283.000000,h=159.000000
[23:56:59]      35.898300[ms],w=374.000000,h=210.000000
[23:56:59]      55.846200[ms],w=465.000000,h=261.000000
[23:56:59]      78.312600[ms],w=556.000000,h=313.000000
[23:56:59]      115.052900[ms],w=647.000000,h=364.000000
[23:56:59]      139.841100[ms],w=738.000000,h=415.000000
[23:56:59]      173.765900[ms],w=828.000000,h=466.000000
[23:57:00]      220.490800[ms],w=919.000000,h=517.000000
[23:57:00]      269.360700[ms],w=1010.000000,h=568.000000
[23:57:00]      312.319700[ms],w=1101.000000,h=619.000000
[23:57:01]      373.860800[ms],w=1192.000000,h=671.000000
[23:57:01]      425.744100[ms],w=1283.000000,h=722.000000
[23:57:02]      487.071700[ms],w=1374.000000,h=773.000000
[23:57:02]      551.982500[ms],w=1465.000000,h=824.000000
[23:57:03]      623.929800[ms],w=1556.000000,h=875.000000
[23:57:04]      691.688200[ms],w=1647.000000,h=926.000000
[23:57:04]      768.495500[ms],w=1738.000000,h=978.000000
[23:57:05]      853.336300[ms],w=1829.000000,h=1029.000000
[23:57:07]      2139.830600[ms],w=1920.000000,h=1080.000000

次にマルチスレッドver.

[23:59:08]      9.241500[ms],w=192.000000,h=108.000000
[23:59:11]      12.451600[ms],w=283.000000,h=159.000000
[23:59:12]      15.730300[ms],w=374.000000,h=210.000000
[23:59:12]      33.719700[ms],w=465.000000,h=261.000000
[23:59:12]      43.787800[ms],w=556.000000,h=313.000000
[23:59:12]      49.983100[ms],w=647.000000,h=364.000000
[23:59:12]      61.831800[ms],w=738.000000,h=415.000000
[23:59:12]      73.344900[ms],w=828.000000,h=466.000000
[23:59:12]      91.863800[ms],w=919.000000,h=517.000000
[23:59:12]      108.050100[ms],w=1010.000000,h=568.000000
[23:59:13]      124.576600[ms],w=1101.000000,h=619.000000
[23:59:13]      147.918700[ms],w=1192.000000,h=671.000000
[23:59:13]      164.971000[ms],w=1283.000000,h=722.000000
[23:59:13]      125.039100[ms],w=1374.000000,h=773.000000
[23:59:13]      220.602900[ms],w=1465.000000,h=824.000000
[23:59:13]      160.000400[ms],w=1556.000000,h=875.000000
[23:59:14]      225.907000[ms],w=1647.000000,h=926.000000
[23:59:14]      203.006700[ms],w=1738.000000,h=978.000000
[23:59:14]      232.062900[ms],w=1829.000000,h=1029.000000
[23:59:15]      265.403500[ms],w=1920.000000,h=1080.000000

ちゃんと処理速度が速くなっていて安心ですね。

おわりに

luaでマルチスレッドやってみました。
変数共有とデバッグが大変ですが、処理速度の向上が見込めますね。
次はコルーチンでやる方法を試してみたいです。

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up