Speech2Speech agent

  1# You may need to add your working directory to the Python path. To do so, uncomment the following lines of code
  2# import sys
  3# sys.path.append("/Path/to/directory/agentic-framework") # Replace with your directory path
  4
  5# Besser Agentic Framework Multilingual speech-to-speech example agent
  6
  7# imports
  8import logging
  9import base64
 10
 11from baf.core.agent import Agent
 12from baf.core.session import Session
 13from baf.exceptions.logger import logger
 14
 15from baf.nlp.llm.llm_openai_api import LLMOpenAI
 16from baf.nlp.speech2text.openai_speech2text import OpenAISpeech2Text
 17
 18from baf.nlp.speech2text.luxasr_speech2text import LuxASRSpeech2Text
 19from baf.nlp.text2speech.openai_text2speech import OpenAIText2Speech
 20from baf.nlp.text2speech.piper_text2speech import PiperText2Speech
 21
 22from baf.core.file import File
 23from baf.library.transition.events.base_events import ReceiveFileEvent, ReceiveMessageEvent
 24from baf.library.transition.events.base_events import ReceiveJSONEvent
 25
 26from baf.core.processors.audio_language_detection_processor import AudioLanguageDetectionProcessor
 27
 28# Configure the logging module (optional)
 29logger.setLevel(logging.INFO)
 30
 31# Create the agent
 32agent = Agent('Multilingual Speech-to-Speech Agent')
 33
 34# Load agent properties stored in a dedicated file
 35agent.load_properties('config.yaml')
 36
 37# Define the platform your agent will use
 38websocket_platform = agent.use_websocket_platform(use_ui=True)
 39
 40# Define STT and TTS Models
 41stt = OpenAISpeech2Text(agent=agent, model_name="whisper-1", language=
 42"en")
 43stt2 = OpenAISpeech2Text(agent=agent, model_name="gpt-4o-mini-transcribe")
 44tts = OpenAIText2Speech(agent=agent, model_name="gpt-4o-mini-tts", language="en", voice="coral")
 45tts2 = OpenAIText2Speech(agent=agent, model_name="gpt-4o-mini-tts", language="fr", voice="ash")
 46stt_lux = LuxASRSpeech2Text(agent=agent, language="lb")
 47piper = PiperText2Speech(agent, language="lb")
 48
 49# Create the LLM
 50gpt = LLMOpenAI(
 51    agent=agent,
 52    name='gpt-4.1',
 53    parameters={},
 54    num_previous_messages=100,
 55    global_context='You are a helpful assistant. Always match and answer in the language the user is speaking to you. '
 56                   'Keep your answers concise and to the point. Do not use any formatting or bullet points.',
 57)
 58
 59# Define processor (for spoken language recognition)
 60process = AudioLanguageDetectionProcessor(agent=agent, transcription_model=stt2, llm_name='gpt-4.1')
 61
 62# States
 63initial_state = agent.new_state('initial_state', initial=True)
 64awaiting_state = agent.new_state('awaiting_state') # for awaiting user input
 65sts_state = agent.new_state('sts_message_state')  # for messages and speech
 66sts_file_state = agent.new_state('sts_file_state')  # for audio files uploaded through the UI
 67
 68
 69# STATES BODIES' DEFINITION + TRANSITIONS
 70
 71def initial_body(session: Session):
 72    session.set("user_language", "en")  # Set default user language to English
 73    answer = gpt.predict(
 74        f"You are a helpful assistant. Start the conversation with a short (2-15 words) greetings message. Make it original.")
 75    session.reply(answer)
 76
 77initial_state.set_body(initial_body)
 78initial_state.go_to(awaiting_state)
 79
 80def awaiting_body(session:Session):
 81    pass
 82
 83awaiting_state.set_body(awaiting_body)
 84awaiting_state.when_file_received(allowed_types=("audio/wav", "audio/mpeg", "audio/mp4", "text/plain")).go_to(
 85    sts_file_state)  # Only Allow Wav, MP3, MP4 files
 86awaiting_state.when_event(ReceiveJSONEvent()).go_to(sts_state)  # when Audio is received through the UI
 87awaiting_state.when_no_intent_matched().go_to(sts_state)
 88
 89def stt_message_body(session: Session):
 90    # only transcribe message if the user spoke
 91    if isinstance(session.event, ReceiveJSONEvent) or isinstance(session.event, ReceiveMessageEvent):
 92        session.reply("User: " + session.event.message)
 93    answer = gpt.chat(session)
 94    websocket_platform.reply_speech(session, answer)
 95    session.reply(answer)
 96
 97
 98sts_state.set_body(stt_message_body)
 99sts_state.go_to(awaiting_state)
100
101
def _guess_mime_type(file_name: str) -> str:
    """Return the MIME type implied by the extension of *file_name*.

    The comparison is case-insensitive. Names with an unrecognised extension
    map to "application/octet-stream".
    """
    lowered = file_name.lower()
    if lowered.endswith(".txt"):
        return "text/plain"
    if lowered.endswith(".wav"):
        return "audio/wav"
    if lowered.endswith(".mp3"):
        return "audio/mpeg"
    # "audio/mp4" uploads are accepted by the file transition, so both common
    # extensions of that container are recognised
    if lowered.endswith((".m4a", ".mp4")):
        return "audio/mp4"
    return "application/octet-stream"


# Execute when a file is received
def stt_file_body(session: Session):
    """Handle an uploaded file and answer with both text and speech.

    Text files (".txt") are decoded and read out as they are. Any other file
    is treated as audio: it is transcribed with the speech-to-text module
    mapped to the session's "user_language" (default "en"), the transcription
    is echoed back prefixed with "User: ", and the LLM's answer to it is
    sent as the reply.

    Args:
        session: the session of the current user; its event is expected to be
            a ReceiveFileEvent.

    Raises:
        KeyError: if an audio file is received and no speech-to-text module is
            mapped to the user language.
    """
    event: ReceiveFileEvent = session.event
    file: File = event.file
    mime_type = _guess_mime_type(file.name)

    if mime_type == "text/plain":
        # Decode the base64 payload into text; undecodable bytes are replaced
        # instead of aborting the whole state on a non-UTF-8 upload
        file_text = base64.b64decode(file._base64).decode('utf-8', errors='replace')
    else:
        # The STT module is only looked up for audio, so a text upload cannot
        # fail because of a language without a speech-to-text module
        lang = session.get("user_language", "en")
        s2t = session._agent._nlp_engine._language_to_speech2text_module[lang]

        # Decode the base64 payload into raw audio bytes
        file_bytes = base64.b64decode(file._base64)
        logger.info(f"Successfully decoded {len(file_bytes)} bytes.")

        if lang == "lb":
            # The LuxASR model additionally needs the MIME type of the audio
            text = s2t.speech2text(file_bytes, mime_type)
        else:
            text = s2t.speech2text(file_bytes)

        session.reply("User: " + text)
        file_text = gpt.predict(text)

    # Send the resulting text and its spoken version to the user
    session.reply(file_text)
    websocket_platform.reply_speech(session, file_text)


sts_file_state.set_body(stt_file_body)
# Go back to waiting for the next user input
sts_file_state.go_to(awaiting_state)
157
158
# Start the agent only when this file is executed as a script
if __name__ == "__main__":
    agent.run()