SwiftでOpenAIの機能を使う

Last updated at 2024-12-05 / Posted at 2024-12-05

SwiftでChatGPTのAPIを使う際に、MacPaw/OpenAI という便利なライブラリがありますが、あまり日本語のブログや記事がないので基本的な使い方についてまとめてみました。API KEYは取得できている前提の記事です。知人用に取り急ぎ書いているので、後から詳しい解説を増やしていく予定です。

急いでいる人向け

テキストを渡す

// Create the OpenAI client with the API key you obtained.
let openAI = OpenAI(apiToken: "YOUR_API_KEY")
// Build the completion request.
let query = CompletionsQuery(
    model: "gpt-3.5-turbo-instruct",
    prompt: "YOUR_PROMPT",
    temperature: 0,
    maxTokens: 100,
    topP: 1,
    frequencyPenalty: 0,
    presencePenalty: 0,
    stop: ["\\n"]
)
// Send the request and wait for the response.
let result = try await openAI.completions(query: query)
// Take the text of the first choice, if there is one.
if let answer = result.choices.first?.text {
    // Do whatever you need with `answer` here.
}

画像データを渡す

// Create the OpenAI client with the API key you obtained.
let openAI = OpenAI(apiToken: "YOUR_API_KEY")
// Build a chat request whose single user message carries a text part and an image part.
let chatQuery = ChatQuery(
    messages: [
        .user(.init(content: .vision([
            .chatCompletionContentPartTextParam(.init(text: "YOUR_PROMPT")),
            .chatCompletionContentPartImageParam(.init(imageUrl: .init(url: imageData, detail: .auto)))
        ])))
    ],
    model: Model.gpt4_o,
    maxTokens: 50
)
// Send the request and wait for the response.
let result = try await openAI.chats(query: chatQuery)
// Read the first choice; only plain-string content is handled here.
if let choice = result.choices.first {
    if case .string(let text) = choice.message.content {
        // Do whatever you need with `text` here.
    } else {
        print("Content is not a string")
    }
} else {
    print("No choices available")
}

テキストをChatGPTに渡す

1. APIにアクセスするために、取得したAPI KEYを使ってOpenAIクラスのインスタンスopenAIを作成する

// Create the client used for every request; replace the placeholder with your own API key.
let openAI = OpenAI(apiToken: "YOUR_TOKEN_HERE")

APIへのRequestを作成するためにCompletionsQueryというデータ構造を用いる。（画像セクションで使う、ChatQueryを用いても良いです。なんなら、公式ドキュメントのexampleではCompletionsQueryを使っていますが、OpenAI的にはChatQuery推奨らしい👀）

/// Request parameters for a text completion.
struct CompletionsQuery: Codable {
    /// ID of the model to use.
    public let model: Model
    /// The prompt to send.
    public let prompt: String
    /// Controls the model's "creativity". Higher values (e.g. 0.9) give riskier, more varied output; lower values (e.g. 0) give more consistent output.
    public let temperature: Double?
    /// Maximum number of tokens to generate.
    public let maxTokens: Int?
    /// How much of the probability mass to consider. 0.1 means only the tokens in the top 10% are considered; 1.0 means all tokens are considered.
    public let topP: Double?
    /// Penalty that suppresses repetition, from -2.0 to 2.0. Positive values penalize tokens by how often they already appear, making the model less likely to repeat the same line.
    public let frequencyPenalty: Double?
    /// Penalty from -2.0 to 2.0. Positive values penalize tokens that have already appeared at all, making the model more likely to move on to new topics.
    public let presencePenalty: Double?
    /// Sequences at which the API stops generating. ["\n", "END"] stops at a newline or at "END".
    public let stop: [String]?
    /// ID that identifies the end user.
    public let user: String?
}

以下はCompletionsQueryのイニシャライザ。modelとprompt以外の引数にはデフォルト値nilが設定されているので、modelとpromptだけ指定してあげればいいことがわかります。

/// Creates a completion query.
///
/// Only `model` and `prompt` are required; every other parameter defaults to `nil`.
public init(model: Model, prompt: String, temperature: Double? = nil, maxTokens: Int? = nil, topP: Double? = nil, frequencyPenalty: Double? = nil, presencePenalty: Double? = nil, stop: [String]? = nil, user: String? = nil) {
    // Required parameters.
    self.model = model
    self.prompt = prompt
    // Optional tuning parameters; each stays `nil` unless the caller supplies it.
    self.temperature = temperature
    self.maxTokens = maxTokens
    self.topP = topP
    self.frequencyPenalty = frequencyPenalty
    self.presencePenalty = presencePenalty
    self.stop = stop
    self.user = user
}

2. リクエストのためのクエリを作成する。

// Build the completion request for the example question.
let query = CompletionsQuery(
    model: "gpt-3.5-turbo-instruct",
    prompt: "日本の首都はどこですか?",
    temperature: 0,
    maxTokens: 100,
    topP: 1,
    frequencyPenalty: 0,
    presencePenalty: 0,
    stop: ["\\n"]
)

⚠️ OpenAIのプラットフォームによると、公式ドキュメントのexampleで使われているCompletionsモデルは殆どが既に廃止されているので注意。

3. 生成結果を受け取る。

// Send the request and wait for the response.
let result = try await openAI.completions(query: query)

// Take the text of the first choice, if there is one.
if let answer = result.choices.first?.text {
    // Do whatever you need with `answer` here.
}

例えばanswerをprintすると、

日本の首都は東京です。

が表示される。
生成結果CompletionsResultは以下のようなデータ構造になっている。ただ単に生成結果を見たければ、result.choices.first?.textを見てあげれば良いことがわかる。

/// Result returned for a completions request.
struct CompletionsResult: Codable, Equatable {
    /// One generated candidate.
    public struct Choice: Codable, Equatable {
        /// The generated text.
        public let text: String
        // NOTE(review): presumably the position of this choice among the returned choices — confirm.
        public let index: Int
    }

    public let id: String
    public let object: String
    // NOTE(review): presumably the creation time as a Unix timestamp — confirm.
    public let created: TimeInterval
    public let model: Model
    /// The generated candidates; the usage example reads the text of the first one.
    public let choices: [Choice]
    // NOTE(review): `Usage` is declared elsewhere; presumably token accounting — confirm.
    public let usage: Usage
}

画像をChatGPTに渡す

1. APIにアクセスするために、取得したAPI KEYを使ってOpenAIクラスのインスタンスopenAIを作成する

// Create the client used for every request; replace the placeholder with your own API key.
let openAI = OpenAI(apiToken: "YOUR_TOKEN_HERE")

APIへのRequestを作成するためにChatQueryというデータ構造を用いる。

 /// Request parameters for a chat completion.
 struct ChatQuery: Codable {
     /// ID of the model to use.
     // NOTE(review): the earlier comment said only gpt-3.5-turbo and gpt-3.5-turbo-0301 are supported, but the usage example passes "gpt-4o" — that restriction looks stale.
     public let model: Model
     /// The messages to generate chat completions for
     // NOTE(review): the usage example builds messages with `.user(...)` and `.vision(...)`, so the library version in use may declare this as `[ChatCompletionMessageParam]` rather than `[Chat]` — confirm against the installed version.
     public let messages: [Chat]
     /// A list of functions the model may generate JSON inputs for.
     public let functions: [ChatFunctionDeclaration]?
     /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or top_p but not both.
     public let temperature: Double?
     /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.
     public let topP: Double?
     /// How many chat completion choices to generate for each input message.
     public let n: Int?
     /// Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
     public let stop: [String]?
     /// The maximum number of tokens to generate in the completion.
     public let maxTokens: Int?
     /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.
     public let presencePenalty: Double?
     /// Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
     public let frequencyPenalty: Double?
     /// Modify the likelihood of specified tokens appearing in the completion.
     public let logitBias: [String:Int]?
     /// A unique identifier representing your end-user, which can help OpenAI to monitor and detect abuse.
     public let user: String?
}

2. リクエストのためのクエリを作成する。
最低でもmodelとmessagesだけ指定してあげれば良いことがわかります。
imageDataはjpeg形式であることに気をつけてください。

// Build a chat request whose single user message carries a text part and an image part.
let chatQuery = ChatQuery(
    messages: [
        .user(.init(content: .vision([
            .chatCompletionContentPartTextParam(.init(text: "What's in this image? Answer in only Japanese words.")),
            .chatCompletionContentPartImageParam(.init(imageUrl: .init(url: imageData, detail: .auto)))
        ])))
    ],
    model: "gpt-4o",
    maxTokens: 50
)

ここで、messagesの要素の型であるChatCompletionMessageParamは列挙型で、メッセージの種類（system、user、assistant、tool）ごとにケースが用意されています。

case system(Self.ChatCompletionSystemMessageParam)
case user(Self.ChatCompletionUserMessageParam)
case assistant(Self.ChatCompletionAssistantMessageParam)
case tool(Self.ChatCompletionToolMessageParam)

ChatCompletionUserMessageParamと、その中のcontentのデータ構造を見ると、画像形式のデータを渡したい場合は、contentに.visionを指定したユーザメッセージを作成すれば良いことがわかります。

/// A chat message sent with the user role.
public struct ChatCompletionUserMessageParam: Codable, Equatable {
    /// The message body; see `Content` for the supported forms.
    public let content: Content
    public typealias Role = ChatQuery.ChatCompletionMessageParam.Role
    /// Always `.user` for this message type.
    public let role: Self.Role = .user
    // NOTE(review): presumably an optional display name for the sender — confirm.
    public let name: String?
}

/// The body of a user message.
public enum Content: Codable, Equatable {
    case string(String)               // Plain text
    case vision([VisionContent])      // Images or mixed content
}

3. 生成結果を受け取る。

// Send the request and wait for the response.
let result = try await openAI.chats(query: chatQuery)

// Read the first choice; only plain-string content is handled here.
if let choice = result.choices.first {
    if case .string(let text) = choice.message.content {
        print("Content : \(text)") // The image recognition result is printed here.
    } else {
        print("Content is not a string")
    }
} else {
    print("No choices available")
}

ご参考までに

ジャガイモの画像を渡してそれが何かを画像認識させる簡単なアプリのリポジトリです。

参考文献

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up