# Speech2Text agent

  1# You may need to add your working directory to the Python path. To do so, uncomment the following lines of code
  2# import sys
  3# sys.path.append("/Path/to/directory/agentic-framework") # Replace with your directory path
  4
  5# Besser Agentic Framework Hugging Face Speech-to-text example agent
  6
  7# imports
  8import logging
  9import base64
 10
 11from baf.core.agent import Agent
 12from baf.core.session import Session
 13from baf.exceptions.logger import logger
 14
 15from baf.nlp.llm.llm_openai_api import LLMOpenAI
 16from baf.nlp.speech2text.hf_speech2text import HFSpeech2Text
 17
 18from baf.core.file import File
 19from baf.library.transition.events.base_events import ReceiveFileEvent
 20
 21
 22# Configure the logging module (optional)
 23logger.setLevel(logging.INFO)
 24
 25# Create the agent
 26agent = Agent('Huggingface Speech-to-Text Agent')
 27
 28# Load agent properties stored in a dedicated file
 29agent.load_properties('config.yaml')
 30
 31# example models
 32# 'Lemswasabi/wav2vec2-large-xlsr-53-842h-luxembourgish-14h-with-lm'
 33# 'openai/whisper-tiny'
 34# 'openai/whisper-large-v3'
 35
 36# Define the platform your agent will use
 37websocket_platform = agent.use_websocket_platform(use_ui=True)
 38
 39# Define STT Models
 40stt = HFSpeech2Text(agent=agent, model_name="openai/whisper-tiny")
 41
 42# Create the LLM
 43gpt = LLMOpenAI(
 44    agent=agent,
 45    name='gpt-4o-mini',
 46    parameters={},
 47    num_previous_messages=100,
 48)
 49
 50# States
 51initial_state = agent.new_state('initial_state', initial=True)
 52awaiting_state = agent.new_state('awaiting_state') # for awaiting user input
 53stt_state = agent.new_state('stt_state')  # for messages and speech
 54stt_file_state = agent.new_state('stt_file_state')  # for audio files uploaded through the UI
 55
 56
# STATE BODY DEFINITIONS + TRANSITIONS
 58
 59def initial_body(session: Session):
 60    answer = gpt.predict(
 61        f"You are a helpful assistant. Start the conversation with a short (2-15 words) greetings message. Make it original.")
 62    session.reply(answer)
 63
 64initial_state.set_body(initial_body)
 65initial_state.go_to(awaiting_state)
 66
 67def awaiting_body(session:Session):
 68    pass
 69
 70awaiting_state.set_body(awaiting_body)
 71awaiting_state.when_file_received(allowed_types=("audio/wav", "audio/mpeg", "audio/mp4")).go_to(stt_file_state)  # Only Allow Wav, MP3, MP4 files
 72awaiting_state.when_no_intent_matched().go_to(stt_state)
 73
 74
 75def stt_body(session: Session):
 76    session.reply("User: " + session.event.message)
 77    answer = gpt.chat(session)
 78    session.reply(answer)
 79
 80
 81stt_state.set_body(stt_body)
 82stt_state.go_to(awaiting_state)
 83
 84
 85# Execute when a file is received
 86def stt_file_body(session: Session):
 87    # get user language
 88    lang = session.get("user_language", "en")
 89    # access STT system based on language mapping
 90    s2t = session._agent._nlp_engine._language_to_speech2text_module[lang]
 91    event: ReceiveFileEvent = session.event
 92    file: File = event.file
 93
 94    # convert file to byte representation
 95    base64_content = file._base64
 96    # Decode the base64 string into bytes
 97    file_bytes = base64.b64decode(base64_content)
 98    # add to logger
 99    logger.info(f"Successfully decoded {len(file_bytes)} bytes for Speech2Text.")
100
101    # call HF Speech2Text and get transcription
102    text = s2t.speech2text(file_bytes)
103    session.reply("User: " + text)
104    answer = gpt.predict(text)
105    session.reply(answer)
106
107
# File state: run `stt_file_body`, then go back to `awaiting_state`.
stt_file_state.set_body(stt_file_body)
stt_file_state.go_to(awaiting_state)


# Start the agent only when this file is executed as a script.
if __name__ == "__main__":
    agent.run()