2
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?

Scalable Matrix Extension (SME)Advent Calendar 2024

Day 18

SME日記その17 __arm_new("za")について調べる Part.2

Last updated at Posted at 2025-01-05

「SME日記その8 __arm_new("za")について調べる」では,__arm_new("za")についてドキュメントをもとに調査しました.今回は,実際にApple Clangでコンパイルして,生成されるコードを確認してみました.

SMEシリーズ

サンプルコード

test_sme.c
#include <stdio.h>
#include <arm_sme.h>

/*
 * Empty probe function for inspecting the code the compiler generates for
 * the two SME attributes below. The body contains no statements, so the
 * generated assembly shows only what the attributes (plus the ordinary
 * prologue/epilogue) add.
 *
 * Takes no arguments and returns nothing.
 */
__arm_locally_streaming
__arm_new("za")
void test_arm_new(void)
{
}

/*
 * Entry point: calls test_arm_new() once. argc and argv are not used.
 * There is no explicit return statement; the compiled main returns 0.
 */
int main(int argc, char *argv[])
{
  test_arm_new();
}
clang -O2 -march=armv9-a+sme test_sme.c -o test_sme
% ./test_sme
%
clang -O2 -march=armv9-a+sme -S test_sme.c
test_sme.s
        .section        __TEXT,__text,regular,pure_instructions
        .build_version macos, 15, 0     sdk_version 15, 2
        .globl  _test_arm_new                   ; -- Begin function test_arm_new
        .p2align        2
; test_arm_new: clang -O2 output for an empty C function declared with
; __arm_locally_streaming and __arm_new("za"). Takes no arguments and
; returns no value.
_test_arm_new:                          ; @test_arm_new
        .cfi_startproc
; %bb.0:
; Prologue: push a 96-byte frame and save d8-d15, x19/x20 and x29/x30
; (all callee-saved under AAPCS64).
        stp     d15, d14, [sp, #-96]!           ; 16-byte Folded Spill
        stp     d13, d12, [sp, #16]             ; 16-byte Folded Spill
        stp     d11, d10, [sp, #32]             ; 16-byte Folded Spill
        stp     d9, d8, [sp, #48]               ; 16-byte Folded Spill
        stp     x20, x19, [sp, #64]             ; 16-byte Folded Spill
        stp     x29, x30, [sp, #80]             ; 16-byte Folded Spill
; x29 = address of the saved x29/x30 pair.
        add     x29, sp, #80
; Reserve 16 more bytes (x29-96 .. x29-81); they are filled in further down.
        sub     sp, sp, #16
        .cfi_def_cfa w29, 16
        .cfi_offset w30, -8
        .cfi_offset w29, -16
        .cfi_offset w19, -24
        .cfi_offset w20, -32
        .cfi_offset b8, -40
        .cfi_offset b9, -48
        .cfi_offset b10, -56
        .cfi_offset b11, -64
        .cfi_offset b12, -72
        .cfi_offset b13, -80
        .cfi_offset b14, -88
        .cfi_offset b15, -96
; Enter streaming mode; paired with `smstop sm` just before the epilogue.
        smstart sm
; x8 = streaming vector length in bytes, then squared. SVL x SVL bytes is
; the byte size of the ZA array.
        rdsvl   x8, #1
        mul     x8, x8, x8
; Call ___chkstk_darwin through the GOT with the size copied into x9.
; NOTE(review): presumably a stack probe for the allocation below - confirm.
        mov     x9, x8
Lloh0:
        adrp    x16, ___chkstk_darwin@GOTPAGE
Lloh1:
        ldr     x16, [x16, ___chkstk_darwin@GOTPAGEOFF]
        blr     x16
; sp -= round_up(x8, 16): variable-size stack allocation of SVL*SVL bytes.
        mov     x9, sp
        add     x8, x8, #15
        and     x8, x8, #0xfffffffffffffff0
        sub     x8, x9, x8
        mov     sp, x8
; Fill the 16-byte block at x29-96: offset 0 = address of the buffer just
; allocated (the new sp), offsets 10-15 = zero.
; NOTE(review): this looks like the TPIDR2 block used for lazy ZA saves
; (buffer pointer, slice count, reserved bytes) - confirm against the
; AAPCS64 SME specification.
        stur    wzr, [x29, #-84]
        sturh   wzr, [x29, #-86]
        stur    x8, [x29, #-96]
; If TPIDR2_EL0 is non-zero, call ___arm_tpidr2_save and then clear
; TPIDR2_EL0; if it is zero, skip straight to LBB0_2. This appears to be
; the "commit a lazily saved ZA" step of __arm_new("za").
        mrs     x8, TPIDR2_EL0
        cbz     x8, LBB0_2
; %bb.1:
        bl      ___arm_tpidr2_save
        msr     TPIDR2_EL0, xzr
LBB0_2:
; Enable ZA and zero its contents (fresh ZA state for this function).
        smstart za
        zero    {za}
; The function body is empty, so ZA is disabled again immediately and
; streaming mode is left.
        smstop  za
        smstop  sm
; Epilogue: sp = x29 - 80 discards both the variable-size buffer and the
; 16-byte block, then the saved registers are reloaded in reverse order.
        sub     sp, x29, #80
        ldp     x29, x30, [sp, #80]             ; 16-byte Folded Reload
        ldp     x20, x19, [sp, #64]             ; 16-byte Folded Reload
        ldp     d9, d8, [sp, #48]               ; 16-byte Folded Reload
        ldp     d11, d10, [sp, #32]             ; 16-byte Folded Reload
        ldp     d13, d12, [sp, #16]             ; 16-byte Folded Reload
        ldp     d15, d14, [sp], #96             ; 16-byte Folded Reload
        ret
        .loh AdrpLdrGot Lloh0, Lloh1
        .cfi_endproc
                                        ; -- End function
        .globl  _main                           ; -- Begin function main
        .p2align        2
; main as compiled at -O2: it only loads 0 into w0 and returns.
; No call to _test_arm_new is present in this listing.
_main:                                  ; @main
        .cfi_startproc
; %bb.0:
        mov     w0, #0                          ; =0x0
        ret
        .cfi_endproc
                                        ; -- End function
.subsections_via_symbols

main は 0 を返すだけになっており,最適化によって関数 test_arm_new の呼び出しが削除されてしまったようです(test_arm_new 自体のコードは生成されています).

考察

ドキュメントと比較してみます.

  • 関数は、遅延保存された ZA データをコミットします。

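最初は下記の部分が該当するかと思いましたが,これらの stp/ldp は呼び出し先保存レジスタ(d8〜d15,x19,x20)の退避・復元であり,遅延保存のコミットそのものではなさそうです.遅延保存のコミットに該当するのは,後述のコードに含まれる mrs x8, TPIDR2_EL0 から msr TPIDR2_EL0, xzr までの部分(TPIDR2_EL0 が非ゼロなら ___arm_tpidr2_save を呼び出し,その後 TPIDR2_EL0 をゼロにする処理)だと考えられます.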

        stp     d15, d14, [sp, #-96]!           ; 16-byte Folded Spill
        stp     d13, d12, [sp, #16]             ; 16-byte Folded Spill
        stp     d11, d10, [sp, #32]             ; 16-byte Folded Spill
        stp     d9, d8, [sp, #48]               ; 16-byte Folded Spill
        stp     x20, x19, [sp, #64]             ; 16-byte Folded Spill
...
        ldp     x20, x19, [sp, #64]             ; 16-byte Folded Reload
        ldp     d9, d8, [sp, #48]               ; 16-byte Folded Reload
        ldp     d11, d10, [sp, #32]             ; 16-byte Folded Reload
        ldp     d13, d12, [sp, #16]             ; 16-byte Folded Reload
        ldp     d15, d14, [sp], #96             ; 16-byte Folded Reload
  • 関数は新しい ZA コンテキストを作成し、PSTATE.ZA を有効にします。

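下記の部分が該当しそうです.ただし,厳密に該当するのは末尾の smstart za と zero {za} だと考えられます.先頭の smstart sm は __arm_locally_streaming によるストリーミングモードへの切り替え,rdsvl から stur までは ZA 退避用バッファ(SVL×SVL バイト)のスタック確保と TPIDR2 ブロックの初期化,mrs から msr までは1つ目の項目(遅延保存された ZA データのコミット)に対応するものと思われます.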

        smstart sm
        rdsvl   x8, #1
        mul     x8, x8, x8
        mov     x9, x8
Lloh0:
        adrp    x16, ___chkstk_darwin@GOTPAGE
Lloh1:
        ldr     x16, [x16, ___chkstk_darwin@GOTPAGEOFF]
        blr     x16
        mov     x9, sp
        add     x8, x8, #15
        and     x8, x8, #0xfffffffffffffff0
        sub     x8, x9, x8
        mov     sp, x8
        stur    wzr, [x29, #-84]
        sturh   wzr, [x29, #-86]
        stur    x8, [x29, #-96]
        mrs     x8, TPIDR2_EL0
        cbz     x8, LBB0_2
; %bb.1:
        bl      ___arm_tpidr2_save
        msr     TPIDR2_EL0, xzr
LBB0_2:
        smstart za
        zero    {za}
  • 関数は、戻る前に PSTATE.ZA を無効にします (0 に設定)。

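下記のうち smstop za が該当しそうです.続く smstop sm は __arm_locally_streaming によるストリーミングモードの終了であり,__arm_new("za") とは別の処理だと考えられます.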

        smstop  za
        smstop  sm

ひとまず,__arm_new("za")をApple Clangで使うことができそうで,良かったです.

2
0
0

Register as a new user and use Qiita more conveniently

  1. You get articles that match your needs
  2. You can efficiently read back useful information
  3. You can use dark theme
What you can do with signing up
2
0

Delete article

Deleted articles cannot be recovered.

Draft of this article would be also deleted.

Are you sure you want to delete this article?