# Speech2Text agent
# You may need to add your working directory to the Python path. To do so, uncomment the following two lines of code:
# import sys
# sys.path.append("/Path/to/directory/agentic-framework")  # Replace with your directory path

# Besser Agentic Framework Hugging Face Speech-to-Text example agent

# imports
8import logging
9import base64
10
11from baf.core.agent import Agent
12from baf.core.session import Session
13from baf.exceptions.logger import logger
14
15from baf.nlp.llm.llm_openai_api import LLMOpenAI
16from baf.nlp.speech2text.hf_speech2text import HFSpeech2Text
17
18from baf.core.file import File
19from baf.library.transition.events.base_events import ReceiveFileEvent
20
21
# Optional: raise the framework logger to INFO so informational messages are emitted
logger.setLevel(logging.INFO)

# Instantiate the agent under its display name
agent = Agent("Huggingface Speech-to-Text Agent")

# Read the agent properties from the dedicated configuration file
agent.load_properties("config.yaml")
30
# Example Hugging Face model names:
#   'Lemswasabi/wav2vec2-large-xlsr-53-842h-luxembourgish-14h-with-lm'
#   'openai/whisper-tiny'
#   'openai/whisper-large-v3'

# Platform the agent communicates through (websocket, with the UI enabled)
websocket_platform = agent.use_websocket_platform(use_ui=True)

# Speech-to-text model
stt = HFSpeech2Text(
    agent=agent,
    model_name="openai/whisper-tiny",
)
41
# LLM used both for the greeting and for answering the user
gpt = LLMOpenAI(
    agent=agent,
    name="gpt-4o-mini",
    parameters={},
    num_previous_messages=100,
)
49
# States
# Entry point of the agent
initial_state = agent.new_state("initial_state", initial=True)
# Waits for user input
awaiting_state = agent.new_state("awaiting_state")
# Handles messages and speech
stt_state = agent.new_state("stt_state")
# Handles audio files uploaded through the UI
stt_file_state = agent.new_state("stt_file_state")
55
56
# STATES BODIES' DEFINITION + TRANSITIONS

def initial_body(session: Session) -> None:
    """Open the conversation with a short greeting generated by the LLM.

    Args:
        session: Session the greeting is sent to via ``session.reply``.
    """
    # The prompt has no placeholders, so it is a plain string literal
    # rather than an f-string.
    answer = gpt.predict(
        "You are a helpful assistant. Start the conversation with a short "
        "(2-15 words) greetings message. Make it original."
    )
    session.reply(answer)


initial_state.set_body(initial_body)
# After greeting, move on to the state that waits for user input
initial_state.go_to(awaiting_state)
66
def awaiting_body(session: Session):
    """Do nothing; this state only hosts the transitions declared below."""


awaiting_state.set_body(awaiting_body)
# Only allow WAV, MP3 and MP4 files to trigger the file transcription state
awaiting_state.when_file_received(
    allowed_types=("audio/wav", "audio/mpeg", "audio/mp4"),
).go_to(stt_file_state)
# Anything that matches no intent is handled by the message/speech state
awaiting_state.when_no_intent_matched().go_to(stt_state)
73
74
def stt_body(session: Session):
    """Echo the received message, then reply with the LLM's chat answer."""
    user_message = session.event.message
    session.reply("User: " + user_message)

    llm_answer = gpt.chat(session)
    session.reply(llm_answer)


stt_state.set_body(stt_body)
# Go back to waiting for the next user input
stt_state.go_to(awaiting_state)
83
84
# Execute when a file is received
def stt_file_body(session: Session) -> None:
    """Transcribe an uploaded audio file and answer its content with the LLM.

    The file's base64 payload is decoded to bytes, passed to the
    speech-to-text module selected for the session's ``user_language``
    (``"en"`` when unset), echoed back as ``"User: <text>"`` and then sent
    to the LLM, whose answer is replied to the user.

    Args:
        session: Session whose current event is the received-file event.
    """
    # get user language
    lang = session.get("user_language", "en")

    # Access the STT system based on the language mapping. When no module is
    # registered for this language, fall back to the module-level `stt`
    # instead of failing with a KeyError.
    # NOTE(review): this reaches into private framework attributes; prefer a
    # public accessor if the framework provides one.
    s2t_by_language = session._agent._nlp_engine._language_to_speech2text_module
    try:
        s2t = s2t_by_language[lang]
    except KeyError:
        s2t = stt

    event: ReceiveFileEvent = session.event
    file: File = event.file

    # Decode the file's base64 string into raw bytes
    file_bytes = base64.b64decode(file._base64)
    # Lazy %-formatting: the message is only built if INFO is enabled
    logger.info("Successfully decoded %d bytes for Speech2Text.", len(file_bytes))

    # call the Speech2Text module and get the transcription
    text = s2t.speech2text(file_bytes)
    if not text:
        # Nothing was recognised: do not send an empty prompt to the LLM
        session.reply("Sorry, I could not transcribe the audio file. Please try again.")
        return

    session.reply("User: " + text)
    answer = gpt.predict(text)
    session.reply(answer)


stt_file_state.set_body(stt_file_body)
# Go back to waiting for the next user input
stt_file_state.go_to(awaiting_state)
110
111
# Start the agent only when this file is executed as a script
if __name__ == "__main__":
    agent.run()