More than 1 year has passed since last update.

VoiceVoxや声色インクに喋らせてみる

Posted at 2024-01-04

はじめに

VoiceVoxやCoeiroInk(声色インク)の喋らせ方について、Pythonの参考例は数多く見つかったものの、Swift版は見当たらなかったので、ここに残しておきます。

構造体

VoiceVoxも声色インクもJSONでデータをやり取りするため、JSONEncoder/JSONDecoderでエンコード・デコードするための構造体を最初に定義します。

声色インク構造体

/// One mora entry of the prosody data exchanged with the CoeiroInk HTTP API.
///
/// Values of this type appear as the elements of `KoeiroInkProsody.detail` and are
/// passed on inside `KoeiroInkSynthesis.prosodyDetail`. No `CodingKeys` are declared,
/// so each property name is also the JSON key.
struct KoeiroInkMora: Codable {
	/// Phoneme string for this mora (notation is defined by the engine — TODO confirm).
	let phoneme: String
	/// Presumably the hiragana reading of the mora; verify against the engine's API docs.
	let hira: String
	/// Accent value reported by the engine for this mora (exact semantics not visible here).
	let accent: Int
}

/// JSON payload decoded from the reply of the CoeiroInk prosody-estimation request.
///
/// Property names double as JSON keys because no `CodingKeys` are declared.
struct KoeiroInkProsody: Codable {
	/// Plain string entries returned by the engine (contents are engine-defined — TODO confirm).
	let plain: [String]
	/// Nested mora lists; this is the value forwarded as `prosodyDetail` when synthesizing.
	let detail: [[KoeiroInkMora]]
}

/// JSON request body that is encoded and POSTed to the CoeiroInk synthesis endpoint.
///
/// Property names double as JSON keys because no `CodingKeys` are declared.
struct KoeiroInkSynthesis: Codable {
	/// UUID of the speaker, taken from the engine's speaker list.
	let speakerUuid: String
	/// Style identifier belonging to that speaker.
	let styleId: Int
	/// Text to be spoken.
	let text: String
	/// Prosody previously obtained as `KoeiroInkProsody.detail`.
	let prosodyDetail: [[KoeiroInkMora]]
	/// Speaking-speed factor; the only mutable field of the request.
	var speedScale: Float
	let volumeScale: Float
	let pitchScale: Float
	let intonationScale: Float
	/// Lead-in length before the first phoneme (unit presumably seconds — TODO confirm).
	let prePhonemeLength: Float
	/// Trailing length after the last phoneme (unit presumably seconds — TODO confirm).
	let postPhonemeLength: Float
	/// Sampling rate requested for the generated audio.
	let outputSamplingRate: Int
}

VoiceVox構造体

/// One mora entry inside a `VoiceVoxAccentPhrase` of a VoiceVox audio query.
///
/// Property names (including the snake_case ones) double as JSON keys because no
/// `CodingKeys` are declared and the query is decoded with a default `JSONDecoder`.
struct VoiceVoxMora: Codable {
	/// Text of the mora.
	let text: String
	/// Consonant of the mora; optional, so the key may be absent or null.
	let consonant: String?
	/// Length of the consonant; optional like `consonant` (unit not visible here).
	let consonant_length: Float?
	/// Vowel of the mora.
	let vowel: String
	/// Length of the vowel (unit not visible here).
	let vowel_length: Float
	/// Pitch value for the mora (scale is engine-defined — TODO confirm).
	let pitch: Float
}

/// One accent phrase of a VoiceVox audio query.
///
/// Property names (including the snake_case ones) double as JSON keys because no
/// `CodingKeys` are declared.
struct VoiceVoxAccentPhrase: Codable {
	/// Moras that make up the phrase.
	let moras: [VoiceVoxMora]
	/// Accent value for the phrase (presumably an accent position — verify with the engine docs).
	let accent: Int
	/// Optional mora describing a pause attached to the phrase; may be absent or null.
	let pause_mora: VoiceVoxMora?
	/// Optional flag; by its name it marks an interrogative phrase — TODO confirm.
	let is_interrogative: Bool?
}

/// Audio query decoded from the VoiceVox `audio_query` reply and sent back, re-encoded,
/// as the body of the `synthesis` request.
///
/// The type name keeps its existing `VoicVox` spelling because other code refers to it.
/// Property names double as JSON keys because no `CodingKeys` are declared.
struct VoicVoxAccentQuery: Codable {
	/// Accent phrases describing how the text is to be spoken.
	let accent_phrases: [VoiceVoxAccentPhrase]
	/// Speaking-speed factor; mutable so it can be adjusted before synthesis.
	var speedScale: Float
	let pitchScale: Float
	let intonationScale: Float
	let volumeScale: Float
	/// Lead-in length before the first phoneme (unit presumably seconds — TODO confirm).
	let prePhonemeLength: Float
	/// Trailing length after the last phoneme (unit presumably seconds — TODO confirm).
	let postPhonemeLength: Float
	/// Sampling rate of the generated audio.
	let outputSamplingRate: Int
	/// Whether the generated audio is stereo.
	let outputStereo: Bool
	/// Optional kana string supplied by the engine; may be absent or null.
	let kana: String?
}

音声合成

声色インク

Speaker UUIDとStyle IDは、http://localhost:50032/v1/speakersから取得したものを使います。

/// Endpoint of the local CoeiroInk engine that estimates prosody for a text.
let ProsodyURL = "http://localhost:50032/v1/estimate_prosody"
/// Endpoint of the local CoeiroInk engine that synthesizes audio.
let SynthesisURL = "http://localhost:50032/v1/synthesis"
/// Synthesizes `textToSpeech` with the CoeiroInk engine and saves the audio as
/// `Downloads/test.wav` inside the home directory.
///
/// The work is done in two HTTP round trips:
/// 1. POST the text to `ProsodyURL` and decode the reply as `KoeiroInkProsody`.
/// 2. POST a `KoeiroInkSynthesis` (speaker, style, text, prosody) to `SynthesisURL`
///    and write the returned bytes to disk when the status code is 200.
///
/// Failures are reported with `print` only; nothing is thrown or returned.
///
/// - Parameters:
///   - title: Key into `speakersDict` selecting the speaker/style variation.
///   - textToSpeech: Text to be spoken; used for both prosody estimation and synthesis.
private func synthesisKoeiroInk (of title: String, with textToSpeech: String) async {
	// Resolve the speaker first so that an unknown title costs no network round trip.
	guard let speaker: Variation = speakersDict[title] else {
		print("synthesisKoeiroInk: no speaker registered for \"\(title)\"")
		return
	}
	guard let prosodyURL: URL = URL(string: ProsodyURL), let synthURL: URL = URL(string: SynthesisURL) else {
		print("synthesisKoeiroInk: invalid endpoint URL")
		return
	}
	var request: URLRequest = URLRequest(url: prosodyURL)
	request.httpMethod = "POST"
	request.addValue(ContentTypeJSON, forHTTPHeaderField: ContentTypeKey)
	// Estimate prosody for the very text that is synthesized below, so the two
	// requests can never disagree about what is being spoken.
	let text: Text = Text(text: textToSpeech)
	let encoder: JSONEncoder = JSONEncoder()
	do {
		request.httpBody = try encoder.encode(text)
		let result: (data: Data, resp: URLResponse) = try await session.data(for: request)
		let prosody: KoeiroInkProsody = try JSONDecoder().decode(KoeiroInkProsody.self, from: result.data)
		let synthesis: KoeiroInkSynthesis = KoeiroInkSynthesis(speakerUuid: speaker.speakerUuid, styleId: speaker.styleId, text: textToSpeech, prosodyDetail: prosody.detail, speedScale: 1, volumeScale: 1, pitchScale: 0, intonationScale: 1, prePhonemeLength: 0.1, postPhonemeLength: 0.1, outputSamplingRate: 24000)

		let config: URLSessionConfiguration = URLSessionConfiguration.default
		config.timeoutIntervalForRequest = 10.0
		config.timeoutIntervalForResource = 10.0
		let synthSession: URLSession = URLSession(configuration: config, delegate: self, delegateQueue: OperationQueue.current)
		// A session keeps a strong reference to its delegate until it is invalidated;
		// without this the session (and `self`) would leak on every call.
		defer { synthSession.finishTasksAndInvalidate() }

		var synthRequest: URLRequest = URLRequest(url: synthURL, timeoutInterval: 10.0)
		synthRequest.httpMethod = "POST"
		synthRequest.addValue(ContentTypeJSON, forHTTPHeaderField: ContentTypeKey)
		synthRequest.httpBody = try encoder.encode(synthesis)
		let wav: (data: Data, resp: URLResponse) = try await synthSession.data(for: synthRequest)
		guard let resp: HTTPURLResponse = wav.resp as? HTTPURLResponse, resp.statusCode == 200 else {
			print("synthesisKoeiroInk: synthesis request failed: \(wav.resp)")
			return
		}
		// Build a file URL from the path directly; going through URL(string:) can
		// fail when the home directory contains spaces or non-ASCII characters.
		let wavURL: URL = URL(fileURLWithPath: NSHomeDirectory()).appendingPathComponent("Downloads/test").appendingPathExtension("wav")
		try wav.data.write(to: wavURL)
	} catch let error {
		print(error)
	}
}// end func synthesisKoeiroInk

VoiceVox

VoiceVoxは、読み上げるテキストを一見POSTのボディで渡すように見えますが、実際にはURLクエリーで渡します。それならHTTPメソッドはGETでいいのかと思いきや、POSTでないとエラーになります。気持ち悪いですね。
声色インクは、Speaker UUIDとStyle IDの2つを使いましたが、こちらは、話者とスタイルの組み合わせにユニークなIDを振っているのでIDだけの指定で良いです。

/// Endpoint of the local VoiceVox engine that builds an audio query from text.
let VoiceVoxQueryURL = "http://localhost:50021/audio_query"
/// Endpoint of the local VoiceVox engine that synthesizes audio from an audio query.
let VoicVoxSynthesisURL = "http://localhost:50021/synthesis"
/// Synthesizes `textToSpeech` with the VoiceVox engine and saves the audio as
/// `Downloads/test.wav` inside the home directory.
///
/// The work is done in two HTTP round trips:
/// 1. POST to `VoiceVoxQueryURL` with `text` and `speaker` in the URL query and
///    decode the reply as `VoicVoxAccentQuery`.
/// 2. POST that query (with `speedScale` set to 1.0) to `VoicVoxSynthesisURL` and
///    write the returned bytes to disk when the status code is 200.
///
/// Failures are reported with `print` only; nothing is thrown or returned.
///
/// - Parameters:
///   - title: Key into `speakersDict` selecting the speaker/style variation.
///   - textToSpeech: Text to be spoken.
private func synthesisVoiceVox (of title: String, with textToSpeech: String) async {
	guard let variation: Variation = speakersDict[title] else {
		print("synthesisVoiceVox: no speaker registered for \"\(title)\"")
		return
	}
	// `.urlQueryAllowed` leaves delimiters such as "&", "+" and "=" untouched, which
	// would corrupt the `text` parameter, so they are removed from the allowed set.
	var allowed: CharacterSet = CharacterSet.urlQueryAllowed
	allowed.remove(charactersIn: ":#[]@!$&'()*+,;=")
	guard let encodedText: String = textToSpeech.addingPercentEncoding(withAllowedCharacters: allowed),
		let queryURL: URL = URL(string: "\(VoiceVoxQueryURL)?text=\(encodedText)&speaker=\(variation.styleId)"),
		let synthURL: URL = URL(string: "\(VoicVoxSynthesisURL)?speaker=\(variation.styleId)") else {
		print("synthesisVoiceVox: could not build request URL")
		return
	}
	var request: URLRequest = URLRequest(url: queryURL)
	request.httpMethod = "POST"
	request.addValue(ContentTypeJSON, forHTTPHeaderField: ContentTypeKey)
	// NOTE(review): text and speaker are already in the URL query; this JSON body
	// duplicates them and may be ignored by the engine — confirm before removing.
	let query: VoiceVoxAqudioQuery = VoiceVoxAqudioQuery(text: textToSpeech, speaker: variation.styleId)

	let config: URLSessionConfiguration = URLSessionConfiguration.default
	config.timeoutIntervalForRequest = 30
	config.timeoutIntervalForResource = 30
	let voiceVoxSession: URLSession = URLSession(configuration: config)
	// Release the session's resources once both requests are finished.
	defer { voiceVoxSession.finishTasksAndInvalidate() }

	do {
		request.httpBody = try JSONEncoder().encode(query)
		let result: (data: Data, resp: URLResponse) = try await voiceVoxSession.data(for: request)
		var audioQuery: VoicVoxAccentQuery = try JSONDecoder().decode(VoicVoxAccentQuery.self, from: result.data)
		audioQuery.speedScale = 1.0

		var synthRequest: URLRequest = URLRequest(url: synthURL)
		synthRequest.httpMethod = "POST"
		synthRequest.setValue(ContentTypeJSON, forHTTPHeaderField: ContentTypeKey)
		synthRequest.httpBody = try JSONEncoder().encode(audioQuery)
		let wav: (data: Data, resp: URLResponse) = try await voiceVoxSession.data(for: synthRequest)
		guard let resp: HTTPURLResponse = wav.resp as? HTTPURLResponse, resp.statusCode == 200 else {
			print("synthesisVoiceVox: synthesis request failed: \(wav.resp)")
			return
		}
		// Build a file URL from the path directly; going through URL(string:) can
		// fail when the home directory contains spaces or non-ASCII characters.
		let wavURL: URL = URL(fileURLWithPath: NSHomeDirectory()).appendingPathComponent("Downloads/test").appendingPathExtension("wav")
		try wav.data.write(to: wavURL)
	} catch let error as DecodingError {
		// Spell out which key/type failed and where, using only standard-library API.
		switch error {
		case .keyNotFound(let key, let context):
			print("Missing key \"\(key.stringValue)\" at [\(context.codingPath.map { $0.stringValue }.joined(separator: "."))]: \(context.debugDescription)")
		case .typeMismatch(let type, let context):
			print("Type mismatch for \(type) at [\(context.codingPath.map { $0.stringValue }.joined(separator: "."))]: \(context.debugDescription)")
		case .valueNotFound(let type, let context):
			print("Missing value of \(type) at [\(context.codingPath.map { $0.stringValue }.joined(separator: "."))]: \(context.debugDescription)")
		case .dataCorrupted(let context):
			print("Corrupted data at [\(context.codingPath.map { $0.stringValue }.joined(separator: "."))]: \(context.debugDescription)")
		@unknown default:
			print(error)
		}
		print(error.localizedDescription)
	} catch let error {
		print(error)
	}
}// end func synthesisVoiceVox

You get articles that match your needs
You can efficiently read back useful information
You can use dark theme

What you can do with signing up