はじめに
VoiceVoxやCoeiroInkを喋らせる方法は、Pythonの参考例なら数多く見つかりましたが、Swift版が見当たらなかったので、ここに残しておきます。
構造体
VoiceVoxも声色インクもJSONを返すので、適宜それをデコード・エンコードする必要があります。そこで、JSONEncoder/JSONDecoderでエンコード・デコードするための構造体を最初に定義します。
声色インク構造体
/// A single mora entry of a KoeiroInk prosody result.
///
/// Property names double as the JSON keys used by the synthesized `Codable` conformance.
struct KoeiroInkMora: Codable {
    /// Value of the `phoneme` key.
    let phoneme: String
    /// Value of the `hira` key (presumably the hiragana reading — confirm against the API).
    let hira: String
    /// Value of the `accent` key.
    let accent: Int
}
/// Prosody estimate decoded from the response of KoeiroInk's prosody endpoint.
struct KoeiroInkProsody: Codable {
    /// Value of the `plain` key: a flat list of strings.
    let plain: [String]
    /// Value of the `detail` key: nested lists of morae; this is what gets forwarded
    /// to the synthesis request as `prosodyDetail`.
    let detail: [[KoeiroInkMora]]
}
/// JSON body sent to KoeiroInk's synthesis endpoint.
///
/// Property names are the JSON keys. The declaration order defines the memberwise
/// initializer's argument order, so it must not be rearranged.
struct KoeiroInkSynthesis: Codable {
    // Who speaks, and what is spoken.
    let speakerUuid: String
    let styleId: Int
    let text: String
    /// Nested mora lists, taken from `KoeiroInkProsody.detail`.
    let prosodyDetail: [[KoeiroInkMora]]

    // Voice tuning. `speedScale` is the only field that can be changed after creation.
    var speedScale: Float
    let volumeScale: Float
    let pitchScale: Float
    let intonationScale: Float

    // Lengths before/after the utterance (units not shown here — presumably seconds; confirm).
    let prePhonemeLength: Float
    let postPhonemeLength: Float

    /// Requested sampling rate of the generated audio.
    let outputSamplingRate: Int
}
VoiceVox構造体
/// A single mora inside a VoiceVox accent phrase.
///
/// Property names (including the snake_case ones) are the JSON keys used by the
/// synthesized `Codable` conformance, so they must stay as they are.
struct VoiceVoxMora: Codable {
    /// Value of the `text` key.
    let text: String

    // Consonant part; both values may be absent or null in the JSON.
    let consonant: String?
    let consonant_length: Float?

    // Vowel part; always required.
    let vowel: String
    let vowel_length: Float

    /// Value of the `pitch` key.
    let pitch: Float
}
/// One accent phrase of a VoiceVox audio query.
///
/// Property names are the JSON keys used by the synthesized `Codable` conformance.
struct VoiceVoxAccentPhrase: Codable {
    /// The morae that make up this phrase.
    let moras: [VoiceVoxMora]
    /// Value of the `accent` key.
    let accent: Int
    /// Optional mora under the `pause_mora` key; may be absent or null.
    let pause_mora: VoiceVoxMora?
    /// Optional flag under the `is_interrogative` key; may be absent or null.
    let is_interrogative: Bool?
}
/// Audio query exchanged with VoiceVox: decoded from the audio_query reply and
/// re-encoded as the body of the synthesis request.
///
/// Property names are the JSON keys used by the synthesized `Codable` conformance.
struct VoicVoxAccentQuery: Codable {
    /// Accent phrases describing the utterance.
    let accent_phrases: [VoiceVoxAccentPhrase]

    // Voice tuning. `speedScale` is the only field that can be changed after decoding.
    var speedScale: Float
    let pitchScale: Float
    let intonationScale: Float
    let volumeScale: Float

    // Lengths before/after the utterance (units not shown here — presumably seconds; confirm).
    let prePhonemeLength: Float
    let postPhonemeLength: Float

    // Output format settings.
    let outputSamplingRate: Int
    let outputStereo: Bool

    /// Optional value of the `kana` key; may be absent or null.
    let kana: String?
}
音声合成
声色インク
Speaker UUIDとStyle IDは、http://localhost:50032/v1/speakers から取得したものを使います。
/// KoeiroInk endpoint that receives the text and returns its estimated prosody.
let ProsodyURL = "http://localhost:50032/v1/estimate_prosody"
/// KoeiroInk endpoint that receives the synthesis request and returns the audio data.
let SynthesisURL = "http://localhost:50032/v1/synthesis"
/// Synthesizes `textToSpeech` with KoeiroInk and saves the audio as `Downloads/test.wav`
/// in the user's home directory.
///
/// Two requests are made: the text is POSTed to `ProsodyURL` to obtain a `KoeiroInkProsody`,
/// then that prosody is POSTed together with the speaker settings to `SynthesisURL`. The
/// response body is written to disk only when the server answers with HTTP 200.
/// Failures are printed; nothing is thrown or returned.
///
/// - Parameters:
///   - title: Key into `speakersDict` that selects the speaker/style to use.
///   - textToSpeech: The text to synthesize.
private func synthesisKoeiroInk(of title: String, with textToSpeech: String) async {
    // Resolve the speaker up front so an unknown title fails before any request is sent.
    guard let speaker = speakersDict[title] else {
        print("synthesisKoeiroInk: no speaker registered for \(title)")
        return
    }
    guard let prosodyURL = URL(string: ProsodyURL), let synthURL = URL(string: SynthesisURL) else {
        print("synthesisKoeiroInk: invalid endpoint URL")
        return
    }

    var prosodyRequest = URLRequest(url: prosodyURL)
    prosodyRequest.httpMethod = "POST"
    prosodyRequest.addValue(ContentTypeJSON, forHTTPHeaderField: ContentTypeKey)

    // Estimate prosody for the same text that is synthesized below, so the two cannot disagree.
    // NOTE(review): `await` is kept because `Text` is declared elsewhere; drop it if the
    // initializer turns out to be synchronous and nonisolated.
    let text: Text = await Text(text: textToSpeech)
    let encoder = JSONEncoder()
    do {
        prosodyRequest.httpBody = try encoder.encode(text)
        // `session` here is the one declared outside this function.
        let (prosodyData, prosodyResponse) = try await session.data(for: prosodyRequest)
        guard let prosodyHTTP = prosodyResponse as? HTTPURLResponse, prosodyHTTP.statusCode == 200 else {
            print("synthesisKoeiroInk: prosody estimation did not return HTTP 200")
            return
        }
        let prosody = try JSONDecoder().decode(KoeiroInkProsody.self, from: prosodyData)

        let synthesis = KoeiroInkSynthesis(
            speakerUuid: speaker.speakerUuid,
            styleId: speaker.styleId,
            text: textToSpeech,
            prosodyDetail: prosody.detail,
            speedScale: 1,
            volumeScale: 1,
            pitchScale: 0,
            intonationScale: 1,
            prePhonemeLength: 0.1,
            postPhonemeLength: 0.1,
            outputSamplingRate: 24000
        )

        // Dedicated session for the synthesis call, limited to 10 s for both the request
        // and the whole resource.
        let config = URLSessionConfiguration.default
        config.timeoutIntervalForRequest = 10.0
        config.timeoutIntervalForResource = 10.0
        let synthSession = URLSession(configuration: config, delegate: self, delegateQueue: OperationQueue.current)
        // A session keeps a strong reference to its delegate until invalidated; release it
        // once the outstanding task has finished.
        defer { synthSession.finishTasksAndInvalidate() }

        var synthRequest = URLRequest(url: synthURL, timeoutInterval: 10.0)
        synthRequest.httpMethod = "POST"
        synthRequest.addValue(ContentTypeJSON, forHTTPHeaderField: ContentTypeKey)
        synthRequest.httpBody = try encoder.encode(synthesis)

        let (wavData, wavResponse) = try await synthSession.data(for: synthRequest)
        guard let wavHTTP = wavResponse as? HTTPURLResponse, wavHTTP.statusCode == 200 else {
            print("synthesisKoeiroInk: synthesis did not return HTTP 200")
            return
        }

        // Build the destination as a file URL so home paths containing spaces or
        // non-ASCII characters do not produce a nil URL.
        let wavURL = URL(fileURLWithPath: NSHomeDirectory())
            .appendingPathComponent("Downloads/test")
            .appendingPathExtension("wav")
        try wavData.write(to: wavURL)
    } catch let error {
        print(error)
    }
}// end func synthesisKoeiroInk
VoiceVox
VoiceVoxは、読み上げるテキストを一見POSTのボディで渡しているように見えますが、実際にはURLクエリーで渡します。それならHTTPメソッドはGETでよいのかと思いきや、POSTでないと怒られます。気持ち悪いですね。
声色インクは、Speaker UUIDとStyle IDの2つを使いましたが、こちらは、話者とスタイルの組み合わせにユニークなIDを振っているのでIDだけの指定で良いです。
/// VoiceVox endpoint that turns text (passed in the URL query) into an audio query.
let VoiceVoxQueryURL = "http://localhost:50021/audio_query"
/// VoiceVox endpoint that turns an audio query into audio data.
let VoicVoxSynthesisURL = "http://localhost:50021/synthesis"
/// Synthesizes `textToSpeech` with VoiceVox and saves the audio as `Downloads/test.wav`
/// in the user's home directory.
///
/// Two requests are made: a POST to `VoiceVoxQueryURL` with the text and speaker ID in the
/// URL query, whose reply is decoded into a `VoicVoxAccentQuery`; then a POST of that query
/// (with `speedScale` set to 1.0) to `VoicVoxSynthesisURL`. The response body is written to
/// disk only when the server answers with HTTP 200.
/// Failures are printed; nothing is thrown or returned.
///
/// - Parameters:
///   - title: Key into `speakersDict` that selects the speaker/style to use.
///   - textToSpeech: The text to synthesize.
private func synthesisVoiceVox(of title: String, with textToSpeech: String) async {
    guard let variation = speakersDict[title] else {
        print("synthesisVoiceVox: no speaker registered for \(title)")
        return
    }

    // `.urlQueryAllowed` leaves "&", "+" and "=" untouched, which would split or alter the
    // `text` parameter, so those are escaped as well.
    var queryValueAllowed = CharacterSet.urlQueryAllowed
    queryValueAllowed.remove(charactersIn: "&+=")
    guard let encodedText = textToSpeech.addingPercentEncoding(withAllowedCharacters: queryValueAllowed),
          let queryURL = URL(string: "\(VoiceVoxQueryURL)?text=\(encodedText)&speaker=\(variation.styleId)"),
          let synthURL = URL(string: "\(VoicVoxSynthesisURL)?speaker=\(variation.styleId)") else {
        print("synthesisVoiceVox: could not build the request URLs")
        return
    }

    // One session for both requests, limited to 30 s per request and per resource.
    let config = URLSessionConfiguration.default
    config.timeoutIntervalForRequest = 30
    config.timeoutIntervalForResource = 30
    let voiceVoxSession = URLSession(configuration: config)
    // Release the session once its outstanding tasks have finished.
    defer { voiceVoxSession.finishTasksAndInvalidate() }

    var queryRequest = URLRequest(url: queryURL)
    queryRequest.httpMethod = "POST"
    queryRequest.addValue(ContentTypeJSON, forHTTPHeaderField: ContentTypeKey)
    // NOTE(review): the URL query already carries the text and speaker; this JSON body is
    // kept from the original and looks redundant — confirm before removing.
    let query: VoiceVoxAqudioQuery = VoiceVoxAqudioQuery(text: textToSpeech, speaker: variation.styleId)
    do {
        queryRequest.httpBody = try JSONEncoder().encode(query)
        let (queryData, queryResponse) = try await voiceVoxSession.data(for: queryRequest)
        guard let queryHTTP = queryResponse as? HTTPURLResponse, queryHTTP.statusCode == 200 else {
            print("synthesisVoiceVox: audio_query did not return HTTP 200")
            return
        }

        var audioQuery = try JSONDecoder().decode(VoicVoxAccentQuery.self, from: queryData)
        audioQuery.speedScale = 1.0

        var synthRequest = URLRequest(url: synthURL)
        synthRequest.httpMethod = "POST"
        synthRequest.setValue(ContentTypeJSON, forHTTPHeaderField: ContentTypeKey)
        synthRequest.httpBody = try JSONEncoder().encode(audioQuery)

        let (wavData, wavResponse) = try await voiceVoxSession.data(for: synthRequest)
        guard let wavHTTP = wavResponse as? HTTPURLResponse, wavHTTP.statusCode == 200 else {
            print("synthesisVoiceVox: synthesis did not return HTTP 200")
            return
        }

        // Build the destination as a file URL so home paths containing spaces or
        // non-ASCII characters do not produce a nil URL.
        let wavURL = URL(fileURLWithPath: NSHomeDirectory())
            .appendingPathComponent("Downloads/test")
            .appendingPathExtension("wav")
        try wavData.write(to: wavURL)
    } catch let error as DecodingError {
        // NOTE(review): `key` and `debugDescription` are presumably supplied by an extension
        // on `DecodingError` declared elsewhere — confirm.
        print(error.key)
        print(error.debugDescription)
        print(error.localizedDescription)
    } catch let error {
        print(error)
    }
}// end func synthesisVoiceVox