Speech-to-text
In this guide you’ll wire a Python sidecar that subscribes to a Smelter input’s audio side channel, runs faster-whisper speech-to-text, and posts the recognised text back to the TypeScript app. The TS app holds the latest line in a Zustand store; the JSX composition reads it and re-renders the subtitle overlay.
The TS app is built up across the steps below in one app.tsx file. The Python
sidecar lives in transcribe.py.
-
Install the TypeScript app’s dependencies and the Python sidecar’s dependencies, then export the directory where Smelter will create the side channel sockets. Both the TS app and the sidecar read it from the environment, so set it once in the shell you run them from.
```sh
pnpm add @swmansion/smelter @swmansion/smelter-node react zustand
pip install smelter-sdk faster-whisper
export SMELTER_SIDE_CHANNEL_SOCKET_DIR=/tmp/smelter-sockets
```
-
Initialize Smelter.
app.tsx
```tsx
import Smelter from "@swmansion/smelter-node";

async function main() {
  const smelter = new Smelter();
  await smelter.init();
}

main().catch(console.error);
```
-
Add a Zustand store (or any other state management) for the current subtitle text, plus the HTTP endpoint the sidecar POSTs to. The endpoint writes to the store from outside React with
useStore.getState().setSubtitle(...), which re-renders the JSX.
app.tsx
```tsx
import { create } from "zustand";
import http from "node:http";

interface SubtitleStore {
  subtitle: string;
  setSubtitle: (text: string) => void;
}

const useStore = create<SubtitleStore>((set) => ({
  subtitle: "",
  setSubtitle: (subtitle) => set({ subtitle }),
}));

http.createServer((req, res) => {
  if (req.method !== "POST" || req.url !== "/update") {
    res.statusCode = 404;
    res.end();
    return;
  }
  let body = "";
  req.on("data", (chunk) => (body += chunk));
  req.on("end", () => {
    const { text } = JSON.parse(body) as { text: string };
    useStore.getState().setSubtitle(text);
    res.end();
  });
}).listen(3001, "127.0.0.1");
```
-
Wire the WHIP input, WHEP output, and the subtitle composition. The input’s
sideChannel.delayMs delays the output relative to the input, giving the sidecar time to transcribe each chunk before the matching frame is rendered. Matching the chunk length to delayMs (both 5000) keeps the subtitle roughly in step with the spoken words, so the store can be updated as soon as a line is recognised.
app.tsx
```tsx
import Smelter from "@swmansion/smelter-node";
import http from "node:http";
import { create } from "zustand";
import { View, InputStream, Text, Rescaler } from "@swmansion/smelter";

interface SubtitleStore {
  subtitle: string;
  setSubtitle: (text: string) => void;
}

const useStore = create<SubtitleStore>((set) => ({
  subtitle: "",
  setSubtitle: (subtitle) => set({ subtitle }),
}));

http.createServer((req, res) => {
  if (req.method !== "POST" || req.url !== "/update") {
    res.statusCode = 404;
    res.end();
    return;
  }
  let body = "";
  req.on("data", (chunk) => (body += chunk));
  req.on("end", () => {
    const { text } = JSON.parse(body) as { text: string };
    useStore.getState().setSubtitle(text);
    res.end();
  });
}).listen(3001, "127.0.0.1");

function Composition() {
  const subtitle = useStore((s) => s.subtitle);
  return (
    <View style={{ width: 1920, height: 1080 }}>
      <Rescaler>
        <InputStream inputId="input" />
      </Rescaler>
      {subtitle && (
        <View
          style={{
            bottom: 40,
            left: 80,
            width: 1760,
            height: 120,
            backgroundColor: "#000000EE",
            paddingHorizontal: 40,
            direction: "column",
          }}>
          <View />
          <Text
            style={{
              width: 1680,
              fontSize: 40,
              color: "#FFFFFFFF",
              align: "center",
            }}>
            {subtitle}
          </Text>
          <View />
        </View>
      )}
    </View>
  );
}

async function main() {
  const smelter = new Smelter();
  await smelter.init();

  await smelter.registerInput("input", {
    type: "whip_server",
    bearerToken: "example",
    sideChannel: { audio: true, delayMs: 5000 },
  });

  await smelter.registerOutput("output", <Composition />, {
    type: "whep_server",
    bearerToken: "example",
    video: {
      resolution: { width: 1920, height: 1080 },
      encoder: { type: "ffmpeg_h264", preset: "ultrafast" },
    },
    audio: { encoder: { type: "opus" } },
  });

  await smelter.start();
}

main().catch(console.error);
```
Run the TS app with tsx app.tsx (or your preferred TypeScript runner). Smelter starts the side channel sockets and waits for a WHIP stream.
-
The Python sidecar subscribes to the audio side channel on one thread and runs Whisper on another, then POSTs each recognised segment to the TS app’s HTTP endpoint.
transcribe.py
```python
import json
import queue
import threading
import urllib.request

import numpy as np
from faster_whisper import WhisperModel
from smelter import subscribe_audio_channel

APP_URL = "http://127.0.0.1:3001/update"
INPUT_ID = "input"
WHISPER_SAMPLE_RATE = 16000
CHUNK_DURATION_MS = 5000  # matches the input's sideChannel.delayMs


def post(body: dict):
    req = urllib.request.Request(
        APP_URL,
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    urllib.request.urlopen(req).read()


def main():
    model = WhisperModel("base", compute_type="int8")
    chunks: queue.Queue[np.ndarray] = queue.Queue()

    def reader():
        buffer = np.empty(0, dtype=np.float32)
        for batch in subscribe_audio_channel(INPUT_ID):
            samples = batch.to_mono()
            if batch.sample_rate != WHISPER_SAMPLE_RATE:
                ratio = WHISPER_SAMPLE_RATE / batch.sample_rate
                target = int(len(samples) * ratio)
                idx = np.linspace(0, len(samples) - 1, target)
                samples = np.interp(idx, np.arange(len(samples)), samples).astype(np.float32)
            buffer = np.concatenate([buffer, samples])
            if len(buffer) >= WHISPER_SAMPLE_RATE * CHUNK_DURATION_MS // 1000:
                chunks.put(buffer)
                buffer = np.empty(0, dtype=np.float32)

    threading.Thread(target=reader, daemon=True).start()

    while True:
        chunk = chunks.get()
        segments, _ = model.transcribe(chunk, language="en")
        for segment in segments:
            text = segment.text.strip()
            if text:
                post({"text": text})


if __name__ == "__main__":
    main()
```
Run it with python transcribe.py, in the same shell where you exported SMELTER_SIDE_CHANNEL_SOCKET_DIR.
-
Stream a test source and watch the result with Smelter’s hosted browser tools (no install required):
- Publish your microphone (and camera) with the WHIP streamer.
- Watch the composed output with the WHEP player.
The subtitle tracks the spoken words because the chunk length matches the output delay: by the time a line is transcribed, the matching audio is reaching the delayed output.