import { AgentFunction, AgentFunctionInfo } from "graphai";
import { SpeechClient, protos } from "@google-cloud/speech";
import { GraphAINullableText } from "@graphai/agent_utils";
/**
 * Values the agent reads from its merged `params` / `namedInputs`.
 */
type STTGoogleInputs = {
// Audio payload to transcribe. A Buffer is base64-encoded before being sent; any other value is passed through unchanged.
stream: Buffer;
// Language of the audio; the agent falls back to "ja-JP" when this is missing or empty.
languageCode?: string;
// Sample rate in hertz; the agent falls back to 48000 when this is missing or 0.
sampleRateHertz?: number;
// Audio encoding name; the agent falls back to "FLAC" when this is missing or null.
encoding?: "FLAC" | "ENCODING_UNSPECIFIED" | "LINEAR16" | "MULAW" | "AMR" | "AMR_WB" | "OGG_OPUS" | "SPEEX_WITH_HEADER_BYTE" | "MP3" | "WEBM_OPUS" | null | undefined;
// When truthy, a failure is logged and rethrown as Error("STT Google Error"); otherwise it is returned as `{ error }`.
throwError?: boolean;
};
/**
 * Agent-level configuration.
 *
 * NOTE(review): neither field is read by the agent implementation visible in this file —
 * confirm whether they are consumed elsewhere or are still to be wired in.
 */
type STTGoogleConfig = {
apiKey?: string;
verbose?: boolean;
};
/** Parameter type of the agent: the input fields combined with the configuration fields. */
type STTGoogleParams = STTGoogleInputs & STTGoogleConfig;
/** Result type of the agent; an alias of `GraphAINullableText` from `@graphai/agent_utils`. */
type STTGoogleResult = GraphAINullableText
/**
 * Transcribes audio with the Google Cloud Speech-to-Text `recognize` call.
 *
 * `params` and `namedInputs` are merged (named inputs win on conflicts) and read for
 * `stream`, `languageCode`, `sampleRateHertz`, `encoding` and `throwError`.
 * A fresh `SpeechClient` is created for each invocation and closed again before the
 * function settles, so repeated calls do not accumulate open clients.
 *
 * @returns `{ text }` holding the first alternative of every result, joined by newlines.
 *   A result without alternatives contributes the literal `"error"`, and a response
 *   without `results` yields the text `"error"`. When the request fails and `throwError`
 *   is not set, `{ error }` carrying the caught value is returned instead.
 * @throws Error("STT Google Error") when the request fails and `throwError` is truthy;
 *   the underlying error is written to `console.error` first.
 */
export const sttGoogleAgent: AgentFunction<STTGoogleParams, STTGoogleResult, STTGoogleInputs, STTGoogleConfig> = async ({ params, namedInputs }) => {
  const { stream, languageCode, sampleRateHertz, encoding, throwError } = { ...params, ...namedInputs };
  // Declared outside the try so the finally block can release it on every path.
  let speechClient: SpeechClient | undefined;
  try {
    speechClient = new SpeechClient();
    // Buffers are sent as base64 text; anything else is forwarded untouched.
    const audioBytes = stream instanceof Buffer ? stream.toString("base64") : stream;
    const audio = {
      content: audioBytes,
    };
    const config = {
      languageCode: languageCode || "ja-JP",
      sampleRateHertz: sampleRateHertz || 48000,
      encoding: encoding || "FLAC",
    };
    const request = {
      audio: audio,
      config: config,
    };
    // Run recognition on the supplied audio.
    const [response] = await speechClient.recognize(request);
    // Keep only the top alternative of each result; "error" marks results that have none.
    const transcription = response.results
      ? response.results
          .map((result) =>
            result.alternatives && result.alternatives.length > 0
              ? result.alternatives[0].transcript
              : "error"
          )
          .join("\n")
      : "error";
    return {
      text: transcription,
    };
  } catch (e) {
    if (throwError) {
      console.error(e);
      throw new Error("STT Google Error");
    }
    return {
      error: e,
    };
  } finally {
    if (speechClient) {
      try {
        await speechClient.close();
      } catch (closeError) {
        // Best-effort cleanup: a failed close must not replace the result or error produced above.
        console.error(closeError);
      }
    }
  }
};
/**
 * Registration metadata for `sttGoogleAgent`, exported as this module's default export.
 */
const sttGoogleAgentInfo: AgentFunctionInfo = {
name: "sttGoogleAgent",
agent: sttGoogleAgent,
// The mock entry is the same function as `agent`, so it runs the real implementation.
// NOTE(review): confirm that calling the live service from the mock is intended.
mock: sttGoogleAgent,
samples: [
{
// NOTE(review): the input below is placeholder bytes rather than real audio, so the
// listed result is presumably illustrative, not reproducible — confirm.
inputs: { stream: Buffer.from("dummy audio data") },
params: {
languageCode: "en-US",
sampleRateHertz: 16000,
encoding: "LINEAR16",
},
result: {
text: "This is a sample transcription.",
},
},
],
description: "Google Speech-to-Text agent",
category: ["voice"],
author: "Takuya Matsuda",
// NOTE(review): "your-repo" looks like a template placeholder — verify the actual repository URL.
repository: "https://github.com/your-repo/graphai-agents/",
license: "MIT",
};
export default sttGoogleAgentInfo;