1# You may need to add your working directory to the Python path. To do so, uncomment the following lines of code
2# import sys
3# sys.path.append("/Path/to/directory/agentic-framework") # Replace with your directory path
4
5# Besser Agentic Framework Multilingual speech-to-speech example agent
6
7# imports
8import logging
9import base64
10
11from baf.core.agent import Agent
12from baf.core.session import Session
13from baf.exceptions.logger import logger
14
15from baf.nlp.llm.llm_openai_api import LLMOpenAI
16from baf.nlp.speech2text.openai_speech2text import OpenAISpeech2Text
17
18from baf.nlp.speech2text.luxasr_speech2text import LuxASRSpeech2Text
19from baf.nlp.text2speech.openai_text2speech import OpenAIText2Speech
20from baf.nlp.text2speech.piper_text2speech import PiperText2Speech
21
22from baf.core.file import File
23from baf.library.transition.events.base_events import ReceiveFileEvent, ReceiveMessageEvent
24from baf.library.transition.events.base_events import ReceiveJSONEvent
25
26from baf.core.processors.audio_language_detection_processor import AudioLanguageDetectionProcessor
27
# Optional: set the level of the framework logger to INFO
logger.setLevel(logging.INFO)

# Instantiate the agent; the models, states and platform below are attached to it
agent = Agent("Multilingual Speech-to-Speech Agent")

# Read the agent properties from the dedicated configuration file
agent.load_properties("config.yaml")

# Platform the agent communicates through (WebSocket, with use_ui=True)
websocket_platform = agent.use_websocket_platform(use_ui=True)
39
# Speech-to-text (STT) and text-to-speech (TTS) models

# English STT
stt = OpenAISpeech2Text(
    agent=agent,
    model_name="whisper-1",
    language="en",
)
# STT without a fixed language; handed to the language detection processor below
stt2 = OpenAISpeech2Text(
    agent=agent,
    model_name="gpt-4o-mini-transcribe",
)
# English TTS
tts = OpenAIText2Speech(
    agent=agent,
    model_name="gpt-4o-mini-tts",
    language="en",
    voice="coral",
)
# French TTS
tts2 = OpenAIText2Speech(
    agent=agent,
    model_name="gpt-4o-mini-tts",
    language="fr",
    voice="ash",
)
# STT and TTS for language code "lb"
stt_lux = LuxASRSpeech2Text(agent=agent, language="lb")
piper = PiperText2Speech(agent, language="lb")
48
# LLM that produces the agent's answers
gpt = LLMOpenAI(
    agent=agent,
    name="gpt-4.1",
    parameters={},
    num_previous_messages=100,
    # System-level instructions passed as global context to the LLM
    global_context=(
        "You are a helpful assistant. "
        "Always match and answer in the language the user is speaking to you. "
        "Keep your answers concise and to the point. "
        "Do not use any formatting or bullet points."
    ),
)
58
# Processor for spoken language recognition; it transcribes with stt2
process = AudioLanguageDetectionProcessor(
    agent=agent,
    transcription_model=stt2,
    llm_name="gpt-4.1",
)
61
# Agent states
# Entry point of the conversation
initial_state = agent.new_state("initial_state", initial=True)
# Waits for the next user input
awaiting_state = agent.new_state("awaiting_state")
# Handles messages and speech
sts_state = agent.new_state("sts_message_state")
# Handles audio files uploaded through the UI
sts_file_state = agent.new_state("sts_file_state")
67
68
# STATE BODY DEFINITIONS + TRANSITIONS
70
def initial_body(session: Session):
    """Greet the user when the session enters the initial state.

    Stores "en" as the session's default ``user_language``, asks the LLM for a
    short greeting and sends that greeting to the user.

    Args:
        session: The session of the user being greeted.
    """
    # Default user language is English
    session.set("user_language", "en")
    # Plain string literals: the prompt has no placeholders, so no f-string is needed
    greeting = gpt.predict(
        "You are a helpful assistant. Start the conversation with a short (2-15 words) greetings message. "
        "Make it original."
    )
    session.reply(greeting)


initial_state.set_body(initial_body)
# After greeting, go to the state that waits for user input
initial_state.go_to(awaiting_state)
79
def awaiting_body(session: Session):
    """Do nothing; this state only exists to wait for the next user input."""


awaiting_state.set_body(awaiting_body)

# Uploaded files of the allowed types (WAV, MP3, MP4 audio or plain text)
# are handled by the file state
awaiting_state.when_file_received(
    allowed_types=("audio/wav", "audio/mpeg", "audio/mp4", "text/plain"),
).go_to(sts_file_state)
# JSON events (audio received through the UI) go to the message state
awaiting_state.when_event(ReceiveJSONEvent()).go_to(sts_state)
# Messages that match no intent go to the message state as well
awaiting_state.when_no_intent_matched().go_to(sts_state)
88
def stt_message_body(session: Session):
    """Answer the user's message with the LLM, as speech and as text.

    Args:
        session: The session whose current event triggered this state.
    """
    event = session.event
    # Echo the user's message back, but only for message-type events
    if isinstance(event, (ReceiveJSONEvent, ReceiveMessageEvent)):
        session.reply("User: " + event.message)
    llm_answer = gpt.chat(session)
    # Send the answer as speech first, then the same answer as text
    websocket_platform.reply_speech(session, llm_answer)
    session.reply(llm_answer)


sts_state.set_body(stt_message_body)
# Once answered, go back to waiting for user input
sts_state.go_to(awaiting_state)
100
101
# Execute when a file is received

# File-name suffix -> MIME type for the kinds of upload this agent handles
_MIME_TYPE_BY_SUFFIX = {
    ".txt": "text/plain",
    ".wav": "audio/wav",
    ".mp3": "audio/mpeg",
    ".m4a": "audio/mp4",
    ".mp4": "audio/mp4",
}


def _guess_mime_type(file_name: str) -> str:
    """Return the MIME type implied by the suffix of *file_name* (case-insensitive)."""
    lowered = file_name.lower()
    for suffix, mime_type in _MIME_TYPE_BY_SUFFIX.items():
        if lowered.endswith(suffix):
            return mime_type
    # Unknown suffix: generic binary type, which the caller handles as audio
    return "application/octet-stream"


def stt_file_body(session: Session):
    """Handle an uploaded file and answer the user as text and as speech.

    A plain-text file is decoded as UTF-8 and its content is sent back as is.
    Any other file is treated as audio: it is transcribed with the
    speech-to-text module registered for the session's ``user_language``
    (default "en"), the transcription is echoed to the user, and the LLM's
    answer to it is sent back.

    Args:
        session: The session whose current event carries the uploaded file.

    Raises:
        KeyError: If an audio file is received and no speech-to-text module
            is registered for the session language.
        UnicodeDecodeError: If a plain-text file is not valid UTF-8.
    """
    event: ReceiveFileEvent = session.event
    uploaded: File = event.file
    mime_type = _guess_mime_type(uploaded.name)

    # The file content is base64-encoded whatever its kind, so decode it once
    file_bytes = base64.b64decode(uploaded._base64)

    if mime_type == "text/plain":
        # Text files need no transcription and no speech-to-text module
        file_text = file_bytes.decode('utf-8')
    else:
        logger.info(f"Successfully decoded {len(file_bytes)} bytes.")
        # Pick the speech-to-text module mapped to the user's language
        lang = session.get("user_language", "en")
        s2t = session._agent._nlp_engine._language_to_speech2text_module[lang]
        if lang == "lb":
            # For the LuxASR model, we need to indicate the MIME type
            text = s2t.speech2text(file_bytes, mime_type)
        else:
            text = s2t.speech2text(file_bytes)
        session.reply("User: " + text)
        file_text = gpt.predict(text)

    # Send the result as text, then as speech
    session.reply(file_text)
    websocket_platform.reply_speech(session, file_text)


sts_file_state.set_body(stt_file_body)
# Once the file is handled, go back to waiting for user input
sts_file_state.go_to(awaiting_state)
157
158
# Start the agent only when this file is executed as a script
if __name__ == "__main__":
    agent.run()