Setting up real-time speech-to-text on macOS is easier than ever with Apple’s MLX, OpenAI’s Whisper model, and a few Python libraries. In this guide, I’ll walk you through the setup, the installation of the necessary dependencies, and the code that captures audio from your microphone, transcribes it, and copies the resulting text to the clipboard. Let’s dive in!
Prerequisites
To get started, ensure you have Python 3 installed on your system. You can verify this by running:
python3 --version
If you don’t have Python installed, download it from python.org.
Required Libraries
The following Python libraries are required to set up real-time speech-to-text:
- mlx-whisper: A version of Whisper compatible with MLX models, used for speech transcription.
- pyaudio: For capturing audio from the microphone.
- numpy: To process audio data in an efficient format.
- pyperclip: For copying the transcribed text to the clipboard.
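Install these libraries via pip (PyAudio builds against the PortAudio library, so on macOS install that first, e.g. with brew install portaudio):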
pip install mlx-whisper pyaudio numpy pyperclip
Full Code for Real-Time Transcription
The following Python script sets up a real-time transcription program that listens for audio input, transcribes it, and copies the text to the clipboard. It supports two modes:
- Single Transcription Mode (--single): Captures a single speech input, prints the result, copies it to the clipboard, and exits.
- Continuous Mode: Continuously listens for speech, transcribes each phrase, copies it to the clipboard, then waits for you to press Enter before listening again. The program exits if it transcribes the word “exit.”
Save this code as rt.py:
import argparse
import mlx_whisper
import pyaudio
import numpy as np
import pyperclip
# Model configuration
# Passed to mlx_whisper.transcribe() as its path_or_hf_repo argument.
MODEL_NAME = "mlx-community/whisper-tiny" # Smaller model for faster processing
# PyAudio configuration
# FORMAT, CHANNELS, RATE and CHUNK are handed to PyAudio.open() when the
# microphone input stream is created.
FORMAT = pyaudio.paInt16 # Audio format (16-bit int)
CHANNELS = 1 # Number of audio channels (mono)
RATE = 16000 # Sampling rate (16 kHz)
CHUNK = 1024 # Buffer size: frames per stream.read() call (1024 / 16000 Hz = 64 ms)
# Silence detection: a chunk counts as silent when its peak absolute int16
# sample value is below SILENCE_THRESHOLD.
SILENCE_THRESHOLD = 500 # Amplitude threshold for detecting silence
# Recording stops once the run of silent chunks exceeds this count
# (11 chunks of 64 ms, i.e. roughly 0.7 s of continuous silence).
SILENCE_CHUNKS = 10 # Number of consecutive chunks of silence before stopping
def _chunk_is_silent(samples):
    """Return True when no sample in *samples* reaches SILENCE_THRESHOLD."""
    if samples.size == 0:
        return True
    # Widen before abs(): in int16, abs(-32768) wraps back to -32768, so a
    # clipped (very loud) chunk could otherwise be misread as silence.
    return np.max(np.abs(samples.astype(np.int32))) < SILENCE_THRESHOLD


def _record_phrase(stream):
    """Read from *stream* until one phrase was spoken; return float32 audio."""
    frames = []
    silent_chunks = 0
    speech_started = False
    while True:
        data = stream.read(CHUNK, exception_on_overflow=False)
        samples = np.frombuffer(data, dtype=np.int16)

        if _chunk_is_silent(samples):
            silent_chunks += 1
        else:
            silent_chunks = 0
            speech_started = True

        # Trailing silence only ends the recording once speech has actually
        # been heard; otherwise we would hand pure silence to the model.
        if speech_started and silent_chunks > SILENCE_CHUNKS:
            break

        # Scale int16 samples to float32 in [-1.0, 1.0).
        frames.append(samples.astype(np.float32) / 32768.0)

        # While still waiting for speech, keep only a short pre-roll so the
        # buffer stays bounded and the first syllable is not clipped.
        if not speech_started and len(frames) > SILENCE_CHUNKS:
            del frames[:len(frames) - SILENCE_CHUNKS]

    return np.concatenate(frames)


def _is_exit_command(transcription):
    """Return True when *transcription* is the spoken "exit" command."""
    # The transcribed text may carry punctuation ("exit."), so ignore it.
    return transcription.strip(" .!?,") == "exit"


def transcribe_audio(single_mode=False, output_file=None):
    """Record one spoken phrase, transcribe it and copy it to the clipboard.

    Opens the microphone, waits for speech, records until a run of more than
    SILENCE_CHUNKS silent chunks follows it, then transcribes the audio with
    mlx_whisper. The lower-cased text is printed, copied to the clipboard
    and, when *output_file* is given, written to that file.

    Args:
        single_mode: Accepted for interface compatibility; it does not
            change the recording or the return value.
        output_file: Optional path that receives the transcription
            (overwritten on every call).

    Returns:
        False when the transcription is the "exit" command, True otherwise,
        so that ``if not transcribe_audio(): break`` keeps a continuous
        loop running until "exit" is spoken.
    """
    audio = pyaudio.PyAudio()
    stream = None
    try:
        stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                            input=True, frames_per_buffer=CHUNK)
        print("Listening...")
        print("Waiting for speech...")
        audio_data = _record_phrase(stream)
    finally:
        # Release the microphone even on errors or Ctrl-C.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()

    # Process audio with mlx_whisper
    result = mlx_whisper.transcribe(audio_data, path_or_hf_repo=MODEL_NAME)
    transcription = result["text"].strip().lower()  # Normalize text for comparison
    print("Transcription:", transcription)

    # Copy transcription to the pasteboard
    pyperclip.copy(transcription)
    print("Text copied to pasteboard.")

    # Write transcription to file if specified
    if output_file:
        with open(output_file, 'w', encoding="utf-8") as f:
            f.write(transcription)
        print(f"Transcription saved to {output_file}")

    if _is_exit_command(transcription):
        print("Exit command received. Stopping program.")
        return False  # Signal to stop the continuous loop
    return True
if __name__ == "__main__":
    # Build the command-line interface.
    cli = argparse.ArgumentParser(
        description="Real-time speech-to-text transcription program.")
    cli.add_argument(
        "--single",
        action="store_true",
        help="Capture a single speech input, transcribe it, copy it to the pasteboard, print the result, and exit.",
    )
    cli.add_argument(
        "--output-file",
        type=str,
        help="Specify a file to save the transcription output. Only used in --single mode.",
    )
    options = cli.parse_args()

    if not options.single:
        # Continuous mode: keep going for as long as transcribe_audio()
        # returns a truthy value, pausing for Enter between phrases.
        while transcribe_audio():
            print("Press Enter to start listening again...")
            input()  # Wait for user to press Enter
    else:
        # One-shot mode: a single phrase, optionally saved to a file.
        transcribe_audio(single_mode=True, output_file=options.output_file)
Running the Program
You can use this script in several ways. And be sure to start talking or it won’t capture anything. Blurt it out!
- Single Transcription, Output to Console and Clipboard:
python rt.py --single
- Single Transcription, Save to File:
python rt.py --single --output-file transcription.txt
- Continuous Mode (stops if “exit” is detected):
python rt.py
Use --help to view all options:
python rt.py --help
This setup provides a versatile tool for real-time speech-to-text on macOS, useful for meetings, note-taking, or just experimenting with speech recognition!
Adding It To Your Toolchain
Let’s extend the program so that it writes the transcription to stdout, and then turn it into an executable command:
#!/usr/bin/env python3
import argparse
import sys
import mlx_whisper
import pyaudio
import numpy as np
import pyperclip
# Model configuration
# Passed to mlx_whisper.transcribe() as its path_or_hf_repo argument.
MODEL_NAME = "mlx-community/whisper-tiny" # Smaller model for faster processing
# PyAudio configuration
# FORMAT, CHANNELS, RATE and CHUNK are handed to PyAudio.open() when the
# microphone input stream is created.
FORMAT = pyaudio.paInt16 # Audio format (16-bit int)
CHANNELS = 1 # Number of audio channels (mono)
RATE = 16000 # Sampling rate (16 kHz)
CHUNK = 1024 # Buffer size: frames per stream.read() call (1024 / 16000 Hz = 64 ms)
# Silence detection: a chunk counts as silent when its peak absolute int16
# sample value is below SILENCE_THRESHOLD.
SILENCE_THRESHOLD = 500 # Amplitude threshold for detecting silence
# Recording stops once the run of silent chunks exceeds this count
# (31 chunks of 64 ms, i.e. roughly 2 s of continuous silence).
SILENCE_CHUNKS = 30 # Number of consecutive chunks of silence before stopping
def _chunk_is_silent(samples):
    """Return True when no sample in *samples* reaches SILENCE_THRESHOLD."""
    if samples.size == 0:
        return True
    # Widen before abs(): in int16, abs(-32768) wraps back to -32768, so a
    # clipped (very loud) chunk could otherwise be misread as silence.
    return np.max(np.abs(samples.astype(np.int32))) < SILENCE_THRESHOLD


def _record_phrase(stream):
    """Read from *stream* until one phrase was spoken; return float32 audio."""
    frames = []
    silent_chunks = 0
    speech_started = False
    while True:
        data = stream.read(CHUNK, exception_on_overflow=False)
        samples = np.frombuffer(data, dtype=np.int16)

        if _chunk_is_silent(samples):
            silent_chunks += 1
        else:
            silent_chunks = 0
            speech_started = True

        # Trailing silence only ends the recording once speech has actually
        # been heard; otherwise we would hand pure silence to the model.
        if speech_started and silent_chunks > SILENCE_CHUNKS:
            break

        # Scale int16 samples to float32 in [-1.0, 1.0).
        frames.append(samples.astype(np.float32) / 32768.0)

        # While still waiting for speech, keep only a short pre-roll so the
        # buffer stays bounded and the first syllable is not clipped.
        if not speech_started and len(frames) > SILENCE_CHUNKS:
            del frames[:len(frames) - SILENCE_CHUNKS]

    return np.concatenate(frames)


def _is_exit_command(transcription):
    """Return True when *transcription* is the spoken "exit" command."""
    # The transcribed text may carry punctuation ("exit."), so ignore it.
    return transcription.strip(" .!?,") == "exit"


def transcribe_audio(single_mode=False, interactive_mode=False,
                     output_file=None, copy_to_clipboard=False):
    """Transcribe speech from the microphone and print it to stdout.

    Each phrase is recorded until a run of more than SILENCE_CHUNKS silent
    chunks follows detected speech, transcribed with mlx_whisper, lower-cased
    and printed to stdout. Status messages go to stderr so that stdout
    carries only transcriptions and can be piped.

    Args:
        single_mode: Stop after the first phrase.
        interactive_mode: Keep transcribing phrase after phrase until the
            "exit" command is spoken. Ignored when *single_mode* is set;
            when neither flag is set the function also stops after one
            phrase.
        output_file: Optional path that receives the transcription
            (overwritten for every phrase).
        copy_to_clipboard: Also copy each transcription to the clipboard.

    Returns:
        False when the "exit" command was transcribed, True when the
        function stopped for any other reason.
    """
    audio = pyaudio.PyAudio()
    stream = None
    try:
        stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                            input=True, frames_per_buffer=CHUNK)
        while True:
            print("Listening...", file=sys.stderr)
            print("Waiting for speech...", file=sys.stderr)
            audio_data = _record_phrase(stream)

            # Process audio with mlx_whisper
            result = mlx_whisper.transcribe(audio_data, path_or_hf_repo=MODEL_NAME)
            transcription = result["text"].strip().lower()  # Normalize text for comparison

            # Flush immediately: when stdout is a pipe it is block-buffered,
            # and a downstream command would otherwise see nothing until the
            # buffer fills or the program exits.
            print(transcription, flush=True)

            # Copy transcription to the pasteboard if specified
            if copy_to_clipboard:
                pyperclip.copy(transcription)
                print("Text copied to pasteboard.", file=sys.stderr)  # Notify to stderr

            # Write transcription to file if specified
            if output_file:
                with open(output_file, 'w', encoding="utf-8") as f:
                    f.write(transcription)
                print(f"Transcription saved to {output_file}", file=sys.stderr)  # Notify to stderr

            if _is_exit_command(transcription):
                print("Exit command received. Stopping program.", file=sys.stderr)
                return False  # Signal to stop the loop

            # Only interactive mode keeps listening for further phrases.
            if single_mode or not interactive_mode:
                return True
    finally:
        # Release the microphone on every exit path, including Ctrl-C.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()
if __name__ == "__main__":
    # Build the command-line interface.
    cli = argparse.ArgumentParser(
        description="Real-time speech-to-text transcription program.")
    cli.add_argument(
        "--single",
        action="store_true",
        help="Capture a single speech input, transcribe it, and exit.",
    )
    cli.add_argument(
        "--interactive",
        action="store_true",
        help="Run in interactive mode, continuously listening for speech.",
    )
    cli.add_argument(
        "--output-file",
        type=str,
        help="Specify a file to save the transcription output. Only used in --single mode.",
    )
    cli.add_argument(
        "--copy",
        action="store_true",
        help="Copy the transcribed text to the clipboard.",
    )
    options = cli.parse_args()

    if not (options.single or options.interactive):
        # No mode selected: tell the user on stderr and do nothing else.
        print("Please specify a mode: --single or --interactive.", file=sys.stderr)
    elif options.single:
        # --single takes precedence when both flags are given.
        transcribe_audio(single_mode=True,
                         output_file=options.output_file,
                         copy_to_clipboard=options.copy)
    else:
        # Interactive mode: repeat for as long as transcribe_audio() returns
        # a truthy value, pausing for Enter between runs.
        while transcribe_audio(interactive_mode=True, copy_to_clipboard=options.copy):
            print("Press Enter to start listening again...", file=sys.stderr)
            input()  # Wait for user to press Enter
Make It Executable
# Copy the script to /usr/local/bin
sudo cp rt.py /usr/local/bin/mlxw
# Make the script executable
sudo chmod +x /usr/local/bin/mlxw
Usage Examples
Now that the script outputs to stdout, you can pipe the transcription directly to other commands or files.
- Single Transcription, Output to File:
mlxw --single > output.txt
- Single Transcription, Copy to Clipboard, and Save to File:
mlxw --single --copy --output-file transcription.txt
- Continuous Mode, Piping to Another Command:
mlxw --interactive | grep "keyword"
which prints only the transcriptions that contain the keyword (highlighted if your grep has color output enabled).
- Continuous Mode with Clipboard Copy:
mlxw --interactive --copy
This setup gives you the flexibility to use mlxw in various ways, including piping output to other commands, redirecting it to files, and optionally copying it to the clipboard.