Upload 13 files
- .gitattributes +1 -0
- eslint.config.js +33 -0
- index.html +13 -0
- package-lock.json +0 -0
- package.json +25 -32
- public/logo.png +3 -0
- src/App.jsx +367 -0
- src/constants.js +53 -0
- src/index.css +12 -0
- src/main.jsx +10 -0
- src/play-worklet.js +73 -0
- src/vad-processor.js +37 -0
- src/worker.js +355 -0
- vite.config.js +19 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+public/logo.png filter=lfs diff=lfs merge=lfs -text
eslint.config.js
ADDED
@@ -0,0 +1,33 @@
import js from "@eslint/js";
import globals from "globals";
import reactHooks from "eslint-plugin-react-hooks";
import reactRefresh from "eslint-plugin-react-refresh";

export default [
  { ignores: ["dist"] },
  {
    files: ["**/*.{js,jsx}"],
    languageOptions: {
      ecmaVersion: 2020,
      globals: globals.browser,
      parserOptions: {
        ecmaVersion: "latest",
        ecmaFeatures: { jsx: true },
        sourceType: "module",
      },
    },
    plugins: {
      "react-hooks": reactHooks,
      "react-refresh": reactRefresh,
    },
    rules: {
      ...js.configs.recommended.rules,
      ...reactHooks.configs.recommended.rules,
      "no-unused-vars": ["error", { varsIgnorePattern: "^[A-Z_]" }],
      "react-refresh/only-export-components": [
        "warn",
        { allowConstantExport: true },
      ],
    },
  },
];
index.html
ADDED
@@ -0,0 +1,13 @@
<!doctype html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <link rel="icon" type="image/png" href="/logo.png" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Transformers.js | Speech-to-speech demo</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.jsx"></script>
  </body>
</html>
package-lock.json
ADDED
The diff for this file is too large to render.
package.json
CHANGED
@@ -1,39 +1,32 @@
The previous manifest ("version": "0.1.0") was removed; it is only partially shown in the rendered diff, but the deleted lines include "react-scripts": "5.0.1", "@testing-library/jest-dom": "^6.6.3", "@testing-library/react": "^16.3.0", "@testing-library/user-event": "^13.5.0", "react": "^19.1.0", "react-dom": "^19.1.0", and "web-vitals": "^2.1.4".

The resulting package.json:
{
  "name": "speech-to-speech",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "vite build",
    "lint": "eslint .",
    "preview": "vite preview"
  },
  "dependencies": {
    "@huggingface/transformers": "^3.5.2",
    "@tailwindcss/vite": "^4.1.4",
    "kokoro-js": "^1.2.1",
    "lucide-react": "^0.503.0",
    "react": "^19.0.0",
    "react-dom": "^19.0.0",
    "tailwindcss": "^4.1.4"
  },
  "devDependencies": {
    "@eslint/js": "^9.22.0",
    "@types/react": "^19.0.10",
    "@types/react-dom": "^19.0.4",
    "@vitejs/plugin-react": "^4.3.4",
    "eslint": "^9.22.0",
    "eslint-plugin-react-hooks": "^5.2.0",
    "eslint-plugin-react-refresh": "^0.4.19",
    "globals": "^16.0.0",
    "vite": "^6.3.1"
  }
}
public/logo.png
ADDED
Git LFS Details
src/App.jsx
ADDED
@@ -0,0 +1,367 @@
import { useEffect, useState, useRef } from "react";
import { Mic, PhoneOff, ChevronDown } from "lucide-react";
import { INPUT_SAMPLE_RATE } from "./constants";

import WORKLET from "./play-worklet.js";

export default function App() {
  const [callStartTime, setCallStartTime] = useState(null);
  const [callStarted, setCallStarted] = useState(false);
  const [playing, setPlaying] = useState(false);

  const [voice, setVoice] = useState("af_heart");
  const [voices, setVoices] = useState([]);

  const [isListening, setIsListening] = useState(false);
  const [isSpeaking, setIsSpeaking] = useState(false);
  const [listeningScale, setListeningScale] = useState(1);
  const [speakingScale, setSpeakingScale] = useState(1);
  const [ripples, setRipples] = useState([]);

  const [ready, setReady] = useState(false);
  const [error, setError] = useState(null);
  const [elapsedTime, setElapsedTime] = useState("00:00");
  const worker = useRef(null);

  const node = useRef(null);

  useEffect(() => {
    worker.current?.postMessage({
      type: "set_voice",
      voice,
    });
  }, [voice]);

  useEffect(() => {
    if (!callStarted) {
      // Reset worker state after call ends
      worker.current?.postMessage({
        type: "end_call",
      });
    }
  }, [callStarted]);

  useEffect(() => {
    if (callStarted && callStartTime) {
      const interval = setInterval(() => {
        const diff = Math.floor((Date.now() - callStartTime) / 1000);
        const minutes = String(Math.floor(diff / 60)).padStart(2, "0");
        const seconds = String(diff % 60).padStart(2, "0");
        setElapsedTime(`${minutes}:${seconds}`);
      }, 1000);
      return () => clearInterval(interval);
    } else {
      setElapsedTime("00:00");
    }
  }, [callStarted, callStartTime]);

  useEffect(() => {
    worker.current ??= new Worker(new URL("./worker.js", import.meta.url), {
      type: "module",
    });

    const onMessage = ({ data }) => {
      console.log("Worker message:", data);
      if (data.error) {
        return onError(data.error);
      }

      switch (data.type) {
        case "status":
          if (data.status === "recording_start") {
            setIsListening(true);
            setIsSpeaking(false);
          } else if (data.status === "recording_end") {
            setIsListening(false);
          } else if (data.status === "ready") {
            setVoices(data.voices);
            setReady(true);
          }
          break;
        case "output":
          if (!playing) {
            node.current?.port.postMessage(data.result.audio);
            setPlaying(true);
            setIsSpeaking(true);
            setIsListening(false);
          }
          break;
      }
    };
    const onError = (err) => setError(err.message);

    worker.current.addEventListener("message", onMessage);
    worker.current.addEventListener("error", onError);

    return () => {
      worker.current.removeEventListener("message", onMessage);
      worker.current.removeEventListener("error", onError);
    };
  }, []);

  useEffect(() => {
    if (!callStarted) return;

    let worklet;
    let inputAudioContext;
    let source;
    let ignore = false;

    let outputAudioContext;
    const audioStreamPromise = navigator.mediaDevices.getUserMedia({
      audio: {
        channelCount: 1,
        echoCancellation: true,
        autoGainControl: true,
        noiseSuppression: true,
        sampleRate: INPUT_SAMPLE_RATE,
      },
    });

    audioStreamPromise
      .then(async (stream) => {
        if (ignore) return;

        inputAudioContext = new (window.AudioContext ||
          window.webkitAudioContext)({
          sampleRate: INPUT_SAMPLE_RATE,
        });

        const analyser = inputAudioContext.createAnalyser();
        analyser.fftSize = 256;
        source = inputAudioContext.createMediaStreamSource(stream);
        source.connect(analyser);

        const inputDataArray = new Uint8Array(analyser.frequencyBinCount);

        function calculateRMS(array) {
          let sum = 0;
          for (let i = 0; i < array.length; ++i) {
            const normalized = array[i] / 128 - 1;
            sum += normalized * normalized;
          }
          const rms = Math.sqrt(sum / array.length);
          return rms;
        }

        await inputAudioContext.audioWorklet.addModule(
          new URL("./vad-processor.js", import.meta.url),
        );
        worklet = new AudioWorkletNode(inputAudioContext, "vad-processor", {
          numberOfInputs: 1,
          numberOfOutputs: 0,
          channelCount: 1,
          channelCountMode: "explicit",
          channelInterpretation: "discrete",
        });

        source.connect(worklet);
        worklet.port.onmessage = (event) => {
          const { buffer } = event.data;
          worker.current?.postMessage({ type: "audio", buffer });
        };

        outputAudioContext = new AudioContext({
          sampleRate: 24000,
        });
        outputAudioContext.resume();

        const blob = new Blob([`(${WORKLET.toString()})()`], {
          type: "application/javascript",
        });
        const url = URL.createObjectURL(blob);
        await outputAudioContext.audioWorklet.addModule(url);
        URL.revokeObjectURL(url);

        node.current = new AudioWorkletNode(
          outputAudioContext,
          "buffered-audio-worklet-processor",
        );

        node.current.port.onmessage = (event) => {
          if (event.data.type === "playback_ended") {
            setPlaying(false);
            setIsSpeaking(false);
            worker.current?.postMessage({ type: "playback_ended" });
          }
        };

        const outputAnalyser = outputAudioContext.createAnalyser();
        outputAnalyser.fftSize = 256;

        node.current.connect(outputAnalyser);
        outputAnalyser.connect(outputAudioContext.destination);

        const outputDataArray = new Uint8Array(
          outputAnalyser.frequencyBinCount,
        );

        function updateVisualizers() {
          analyser.getByteTimeDomainData(inputDataArray);
          const rms = calculateRMS(inputDataArray);
          const targetScale = 1 + Math.min(1.25 * rms, 0.25);
          setListeningScale((prev) => prev + (targetScale - prev) * 0.25);

          outputAnalyser.getByteTimeDomainData(outputDataArray);
          const outputRMS = calculateRMS(outputDataArray);
          const targetOutputScale = 1 + Math.min(1.25 * outputRMS, 0.25);
          setSpeakingScale((prev) => prev + (targetOutputScale - prev) * 0.25);

          requestAnimationFrame(updateVisualizers);
        }
        updateVisualizers();
      })
      .catch((err) => {
        setError(err.message);
        console.error(err);
      });

    return () => {
      ignore = true;

      audioStreamPromise.then((stream) =>
        stream.getTracks().forEach((track) => track.stop()),
      );
      source?.disconnect();
      worklet?.disconnect();
      inputAudioContext?.close();

      outputAudioContext?.close();
    };
  }, [callStarted]);

  useEffect(() => {
    if (!callStarted) return;
    const interval = setInterval(() => {
      const id = Date.now();
      setRipples((prev) => [...prev, id]);
      setTimeout(() => {
        setRipples((prev) => prev.filter((r) => r !== id));
      }, 1500);
    }, 1000);
    return () => clearInterval(interval);
  }, [callStarted]);

  return (
    <div className="h-screen min-h-[240px] flex items-center justify-center bg-gray-50 p-4 relative">
      <div className="h-full max-h-[320px] w-[640px] bg-white rounded-xl shadow-lg p-8 flex items-center justify-between space-x-16">
        <div className="text-green-700 w-[140px]">
          <div className="text-xl font-bold flex justify-between">
            {voices?.[voice]?.name}
            <span className="font-normal text-gray-500">{elapsedTime}</span>
          </div>
          <div className="text-base relative">
            <button
              type="button"
              disabled={!ready}
              className={`w-full flex items-center justify-between border border-gray-300 rounded-md transition-colors ${
                ready
                  ? "bg-transparent hover:border-gray-400"
                  : "bg-gray-100 opacity-50 cursor-not-allowed"
              }`}
            >
              <span className="px-2 py-1">Select voice</span>
              <ChevronDown className="absolute right-2" />
            </button>
            <select
              value={voice}
              onChange={(e) => setVoice(e.target.value)}
              className="absolute inset-0 opacity-0 cursor-pointer"
              disabled={!ready}
            >
              {Object.entries(voices).map(([key, v]) => (
                <option key={key} value={key}>
                  {`${v.name} (${
                    v.language === "en-us" ? "American" : v.language
                  } ${v.gender})`}
                </option>
              ))}
            </select>
          </div>
        </div>

        <div className="relative flex items-center justify-center w-32 h-32 flex-shrink-0 aspect-square">
          {callStarted &&
            ripples.map((id) => (
              <div
                key={id}
                className="absolute inset-0 rounded-full border-2 border-green-200 pointer-events-none"
                style={{ animation: "ripple 1.5s ease-out forwards" }}
              />
            ))}
          <div className="absolute z-10 text-lg text-gray-700">
            {!ready ? "Loading..." : ""}
            {isListening && "Listening..."}
            {isSpeaking && "Speaking..."}
          </div>
          {/* Pulsing loader while initializing */}
          <div
            className={`absolute w-32 h-32 rounded-full bg-green-200 ${
              !ready ? "animate-ping opacity-75" : ""
            }`}
            style={{ animationDuration: "1.5s" }}
          />
          {/* Main rings */}
          <div
            className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-300 ${
              !ready ? "opacity-0" : ""
            }`}
            style={{ transform: `scale(${speakingScale})` }}
          />
          <div
            className={`absolute w-32 h-32 rounded-full shadow-inner transition-transform duration-300 ease-out bg-green-200 ${
              !ready ? "opacity-0" : ""
            }`}
            style={{ transform: `scale(${listeningScale})` }}
          />
        </div>

        <div className="space-y-4 w-[140px]">
          {callStarted ? (
            <button
              className="flex items-center space-x-2 px-4 py-2 bg-red-100 text-red-700 rounded-md hover:bg-red-200"
              onClick={() => {
                setCallStarted(false);
                setCallStartTime(null);
                setPlaying(false);
                setIsListening(false);
                setIsSpeaking(false);
              }}
            >
              <PhoneOff className="w-5 h-5" />
              <span>End call</span>
            </button>
          ) : (
            <button
              className={`flex items-center space-x-2 px-4 py-2 rounded-md ${
                ready
                  ? "bg-blue-100 text-blue-700 hover:bg-blue-200"
                  : "bg-blue-100 text-blue-700 opacity-50 cursor-not-allowed"
              }`}
              onClick={() => {
                setCallStartTime(Date.now());
                setCallStarted(true);
                worker.current?.postMessage({ type: "start_call" });
              }}
              disabled={!ready}
            >
              <span>Start call</span>
            </button>
          )}
        </div>
      </div>

      <div className="absolute bottom-4 text-sm">
        Built with{" "}
        <a
          href="https://github.com/huggingface/transformers.js"
          rel="noopener noreferrer"
          target="_blank"
          className="text-blue-600 hover:underline"
        >
          🤗 Transformers.js
        </a>
      </div>
    </div>
  );
}
src/constants.js
ADDED
@@ -0,0 +1,53 @@
/**
 * Sample rate of the input audio.
 * Coincidentally, this is the same for both models (Moonshine and Silero VAD)
 */
export const INPUT_SAMPLE_RATE = 16000;
const INPUT_SAMPLE_RATE_MS = INPUT_SAMPLE_RATE / 1000;

/**
 * Probabilities ABOVE this value are considered as SPEECH
 */
export const SPEECH_THRESHOLD = 0.3;

/**
 * If current state is SPEECH, and the probability of the next state
 * is below this value, it is considered as NON-SPEECH.
 */
export const EXIT_THRESHOLD = 0.1;

/**
 * After each speech chunk, wait for at least this amount of silence
 * before considering the next chunk as a new speech chunk
 */
export const MIN_SILENCE_DURATION_MS = 400;
export const MIN_SILENCE_DURATION_SAMPLES =
  MIN_SILENCE_DURATION_MS * INPUT_SAMPLE_RATE_MS;

/**
 * Pad the speech chunk with this amount on each side
 */
export const SPEECH_PAD_MS = 80;
export const SPEECH_PAD_SAMPLES = SPEECH_PAD_MS * INPUT_SAMPLE_RATE_MS;

/**
 * Final speech chunks below this duration are discarded
 */
export const MIN_SPEECH_DURATION_SAMPLES = 250 * INPUT_SAMPLE_RATE_MS; // 250 ms

/**
 * Maximum duration of audio that can be handled by Moonshine
 */
export const MAX_BUFFER_DURATION = 30;

/**
 * Size of the incoming buffers
 */
export const NEW_BUFFER_SIZE = 512;

/**
 * The number of previous buffers to keep, to ensure the audio is padded correctly
 */
export const MAX_NUM_PREV_BUFFERS = Math.ceil(
  SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE,
);
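For reference, a quick sketch of what the derived constants work out to at 16 kHz, computed directly from the definitions above:

// With INPUT_SAMPLE_RATE = 16000, INPUT_SAMPLE_RATE_MS = 16 samples per millisecond, so:
// MIN_SILENCE_DURATION_SAMPLES = 400 * 16 = 6400 samples (400 ms of silence closes a speech chunk)
// SPEECH_PAD_SAMPLES           = 80 * 16  = 1280 samples of padding on each side of a chunk
// MIN_SPEECH_DURATION_SAMPLES  = 250 * 16 = 4000 samples (shorter chunks are discarded)
// MAX_NUM_PREV_BUFFERS         = ceil(1280 / 512) = 3 previous 512-sample buffers kept for left padding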
src/index.css
ADDED
@@ -0,0 +1,12 @@
@import "tailwindcss";

@keyframes ripple {
  from {
    transform: scale(1);
    opacity: 0.7;
  }
  to {
    transform: scale(2);
    opacity: 0;
  }
}
src/main.jsx
ADDED
@@ -0,0 +1,10 @@
import { StrictMode } from "react";
import { createRoot } from "react-dom/client";
import "./index.css";
import App from "./App.jsx";

createRoot(document.getElementById("root")).render(
  <StrictMode>
    <App />
  </StrictMode>,
);
src/play-worklet.js
ADDED
@@ -0,0 +1,73 @@
export default () => {
  class BufferedAudioWorkletProcessor extends AudioWorkletProcessor {
    constructor() {
      super();
      this.bufferQueue = [];
      this.currentChunkOffset = 0;
      this.hadData = false;

      this.port.onmessage = (event) => {
        const data = event.data;
        if (data instanceof Float32Array) {
          this.hadData = true;
          this.bufferQueue.push(data);
        } else if (data === "stop") {
          this.bufferQueue = [];
          this.currentChunkOffset = 0;
        }
      };
    }

    process(inputs, outputs) {
      const channel = outputs[0][0];
      if (!channel) return true;

      const numSamples = channel.length;
      let outputIndex = 0;

      if (this.hadData && this.bufferQueue.length === 0) {
        this.port.postMessage({ type: "playback_ended" });
        this.hadData = false;
      }

      while (outputIndex < numSamples) {
        if (this.bufferQueue.length > 0) {
          const currentChunk = this.bufferQueue[0];
          const remainingSamples =
            currentChunk.length - this.currentChunkOffset;
          const samplesToCopy = Math.min(
            remainingSamples,
            numSamples - outputIndex,
          );

          channel.set(
            currentChunk.subarray(
              this.currentChunkOffset,
              this.currentChunkOffset + samplesToCopy,
            ),
            outputIndex,
          );

          this.currentChunkOffset += samplesToCopy;
          outputIndex += samplesToCopy;

          // Remove the chunk if fully consumed.
          if (this.currentChunkOffset >= currentChunk.length) {
            this.bufferQueue.shift();
            this.currentChunkOffset = 0;
          }
        } else {
          // If no data is available, fill the rest of the buffer with silence.
          channel.fill(0, outputIndex);
          outputIndex = numSamples;
        }
      }
      return true;
    }
  }

  registerProcessor(
    "buffered-audio-worklet-processor",
    BufferedAudioWorkletProcessor,
  );
};
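The playback worklet is exported as a plain function so the main thread can stringify it and load it from a Blob URL instead of a separate module file. A condensed sketch of how App.jsx in this commit consumes it (variable names here are illustrative):

import WORKLET from "./play-worklet.js";

const outputAudioContext = new AudioContext({ sampleRate: 24000 });
const blob = new Blob([`(${WORKLET.toString()})()`], { type: "application/javascript" });
const url = URL.createObjectURL(blob);
await outputAudioContext.audioWorklet.addModule(url);
URL.revokeObjectURL(url);

const node = new AudioWorkletNode(outputAudioContext, "buffered-audio-worklet-processor");
node.connect(outputAudioContext.destination);

node.port.postMessage(new Float32Array(24000)); // queue one second of audio for playback
node.port.onmessage = (e) => {
  if (e.data.type === "playback_ended") {
    // the queue has drained; safe to accept microphone input again
  }
};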
src/vad-processor.js
ADDED
@@ -0,0 +1,37 @@
const MIN_CHUNK_SIZE = 512;
let globalPointer = 0;
let globalBuffer = new Float32Array(MIN_CHUNK_SIZE);

class VADProcessor extends AudioWorkletProcessor {
  process(inputs, outputs, parameters) {
    const buffer = inputs[0][0];
    if (!buffer) return; // buffer is null when the stream ends

    if (buffer.length > MIN_CHUNK_SIZE) {
      // If the buffer is larger than the minimum chunk size, send the entire buffer
      this.port.postMessage({ buffer });
    } else {
      const remaining = MIN_CHUNK_SIZE - globalPointer;
      if (buffer.length >= remaining) {
        // If the buffer is larger than (or equal to) the remaining space in the global buffer, copy the remaining space
        globalBuffer.set(buffer.subarray(0, remaining), globalPointer);

        // Send the global buffer
        this.port.postMessage({ buffer: globalBuffer });

        // Reset the global buffer and set the remaining buffer
        globalBuffer.fill(0);
        globalBuffer.set(buffer.subarray(remaining), 0);
        globalPointer = buffer.length - remaining;
      } else {
        // If the buffer is smaller than the remaining space in the global buffer, copy the buffer to the global buffer
        globalBuffer.set(buffer, globalPointer);
        globalPointer += buffer.length;
      }
    }

    return true; // Keep the processor alive
  }
}

registerProcessor("vad-processor", VADProcessor);
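On the main thread, App.jsx registers this processor on the 16 kHz input context and forwards each accumulated 512-sample chunk to the worker. A condensed sketch of that wiring, taken from the App.jsx code in this commit:

await inputAudioContext.audioWorklet.addModule(
  new URL("./vad-processor.js", import.meta.url),
);
const worklet = new AudioWorkletNode(inputAudioContext, "vad-processor", {
  numberOfInputs: 1,
  numberOfOutputs: 0, // analysis-only: the node produces no audio output
  channelCount: 1,
  channelCountMode: "explicit",
  channelInterpretation: "discrete",
});
source.connect(worklet);
worklet.port.onmessage = ({ data: { buffer } }) => {
  // each buffer is a 512-sample Float32Array at 16 kHz
  worker.postMessage({ type: "audio", buffer });
};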
src/worker.js
ADDED
@@ -0,0 +1,355 @@
import {
  // VAD
  AutoModel,

  // LLM
  AutoTokenizer,
  AutoModelForCausalLM,
  TextStreamer,
  InterruptableStoppingCriteria,

  // Speech recognition
  Tensor,
  pipeline,
} from "@huggingface/transformers";

import { KokoroTTS, TextSplitterStream } from "kokoro-js";

import {
  MAX_BUFFER_DURATION,
  INPUT_SAMPLE_RATE,
  SPEECH_THRESHOLD,
  EXIT_THRESHOLD,
  SPEECH_PAD_SAMPLES,
  MAX_NUM_PREV_BUFFERS,
  MIN_SILENCE_DURATION_SAMPLES,
  MIN_SPEECH_DURATION_SAMPLES,
} from "./constants";

const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
let voice;
const tts = await KokoroTTS.from_pretrained(model_id, {
  dtype: "fp32",
  device: "webgpu",
});

const device = "webgpu";
self.postMessage({ type: "info", message: `Using device: "${device}"` });
self.postMessage({
  type: "info",
  message: "Loading models...",
  duration: "until_next",
});

// Load models
const silero_vad = await AutoModel.from_pretrained(
  "onnx-community/silero-vad",
  {
    config: { model_type: "custom" },
    dtype: "fp32", // Full-precision
  },
).catch((error) => {
  self.postMessage({ error });
  throw error;
});

const DEVICE_DTYPE_CONFIGS = {
  webgpu: {
    encoder_model: "fp32",
    decoder_model_merged: "fp32",
  },
  wasm: {
    encoder_model: "fp32",
    decoder_model_merged: "q8",
  },
};
const transcriber = await pipeline(
  "automatic-speech-recognition",
  "onnx-community/whisper-base", // or "onnx-community/moonshine-base-ONNX",
  {
    device,
    dtype: DEVICE_DTYPE_CONFIGS[device],
  },
).catch((error) => {
  self.postMessage({ error });
  throw error;
});

await transcriber(new Float32Array(INPUT_SAMPLE_RATE)); // Compile shaders

const llm_model_id = "HuggingFaceTB/SmolLM2-1.7B-Instruct";
const tokenizer = await AutoTokenizer.from_pretrained(llm_model_id);
const llm = await AutoModelForCausalLM.from_pretrained(llm_model_id, {
  dtype: "q4f16",
  device: "webgpu",
});

const SYSTEM_MESSAGE = {
  role: "system",
  content:
    "You're a helpful and conversational voice assistant. Keep your responses short, clear, and casual.",
};
await llm.generate({ ...tokenizer("x"), max_new_tokens: 1 }); // Compile shaders

let messages = [SYSTEM_MESSAGE];
let past_key_values_cache;
let stopping_criteria;
self.postMessage({
  type: "status",
  status: "ready",
  message: "Ready!",
  voices: tts.voices,
});

// Global audio buffer to store incoming audio
const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE);
let bufferPointer = 0;

// Initial state for VAD
const sr = new Tensor("int64", [INPUT_SAMPLE_RATE], []);
let state = new Tensor("float32", new Float32Array(2 * 1 * 128), [2, 1, 128]);

// Whether we are in the process of adding audio to the buffer
let isRecording = false;
let isPlaying = false; // new flag

/**
 * Perform Voice Activity Detection (VAD)
 * @param {Float32Array} buffer The new audio buffer
 * @returns {Promise<boolean>} `true` if the buffer is speech, `false` otherwise.
 */
async function vad(buffer) {
  const input = new Tensor("float32", buffer, [1, buffer.length]);

  const { stateN, output } = await silero_vad({ input, sr, state });
  state = stateN; // Update state

  const isSpeech = output.data[0];

  // Use heuristics to determine if the buffer is speech or not
  return (
    // Case 1: We are above the threshold (definitely speech)
    isSpeech > SPEECH_THRESHOLD ||
    // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
    (isRecording && isSpeech >= EXIT_THRESHOLD)
  );
}

/**
 * Transcribe the audio buffer
 * @param {Float32Array} buffer The audio buffer
 * @param {Object} data Additional data
 */
const speechToSpeech = async (buffer, data) => {
  isPlaying = true;

  // 1. Transcribe the audio from the user
  const text = await transcriber(buffer).then(({ text }) => text.trim());
  if (["", "[BLANK_AUDIO]"].includes(text)) {
    // If the transcription is empty or a blank audio, we skip the rest of the processing
    return;
  }
  messages.push({ role: "user", content: text });

  // Set up text-to-speech streaming
  const splitter = new TextSplitterStream();
  const stream = tts.stream(splitter, {
    voice,
  });
  (async () => {
    for await (const { text, phonemes, audio } of stream) {
      self.postMessage({ type: "output", text, result: audio });
    }
  })();

  // 2. Generate a response using the LLM
  const inputs = tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    return_dict: true,
  });
  const streamer = new TextStreamer(tokenizer, {
    skip_prompt: true,
    skip_special_tokens: true,
    callback_function: (text) => {
      splitter.push(text);
    },
    token_callback_function: () => {},
  });

  stopping_criteria = new InterruptableStoppingCriteria();
  const { past_key_values, sequences } = await llm.generate({
    ...inputs,
    past_key_values: past_key_values_cache,

    do_sample: false, // TODO: do_sample: true is bugged (invalid data location on topk sample)
    max_new_tokens: 1024,
    streamer,
    stopping_criteria,
    return_dict_in_generate: true,
  });
  past_key_values_cache = past_key_values;

  // Finally, close the stream to signal that no more text will be added.
  splitter.close();

  const decoded = tokenizer.batch_decode(
    sequences.slice(null, [inputs.input_ids.dims[1], null]),
    { skip_special_tokens: true },
  );

  messages.push({ role: "assistant", content: decoded[0] });
};

// Track the number of samples after the last speech chunk
let postSpeechSamples = 0;
const resetAfterRecording = (offset = 0) => {
  self.postMessage({
    type: "status",
    status: "recording_end",
    message: "Transcribing...",
    duration: "until_next",
  });
  BUFFER.fill(0, offset);
  bufferPointer = offset;
  isRecording = false;
  postSpeechSamples = 0;
};

const dispatchForTranscriptionAndResetAudioBuffer = (overflow) => {
  // Get start and end time of the speech segment, minus the padding
  const now = Date.now();
  const end =
    now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000;
  const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000;
  const duration = end - start;
  const overflowLength = overflow?.length ?? 0;

  // Send the audio buffer to the worker
  const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES);

  const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0);
  const paddedBuffer = new Float32Array(prevLength + buffer.length);
  let offset = 0;
  for (const prev of prevBuffers) {
    paddedBuffer.set(prev, offset);
    offset += prev.length;
  }
  paddedBuffer.set(buffer, offset);
  speechToSpeech(paddedBuffer, { start, end, duration });

  // Set overflow (if present) and reset the rest of the audio buffer
  if (overflow) {
    BUFFER.set(overflow, 0);
  }
  resetAfterRecording(overflowLength);
};

let prevBuffers = [];
self.onmessage = async (event) => {
  const { type, buffer } = event.data;

  // refuse new audio while playing back
  if (type === "audio" && isPlaying) return;

  switch (type) {
    case "start_call": {
      const name = tts.voices[voice ?? "af_heart"]?.name ?? "Heart";
      greet(`Hey there, my name is ${name}! How can I help you today?`);
      return;
    }
    case "end_call":
      messages = [SYSTEM_MESSAGE];
      past_key_values_cache = null;
    case "interrupt":
      stopping_criteria?.interrupt();
      return;
    case "set_voice":
      voice = event.data.voice;
      return;
    case "playback_ended":
      isPlaying = false;
      return;
  }

  const wasRecording = isRecording; // Save current state
  const isSpeech = await vad(buffer);

  if (!wasRecording && !isSpeech) {
    // We are not recording, and the buffer is not speech,
    // so we will probably discard the buffer. So, we insert
    // into a FIFO queue with maximum size of PREV_BUFFER_SIZE
    if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
      // If the queue is full, we discard the oldest buffer
      prevBuffers.shift();
    }
    prevBuffers.push(buffer);
    return;
  }

  const remaining = BUFFER.length - bufferPointer;
  if (buffer.length >= remaining) {
    // The buffer is larger than (or equal to) the remaining space in the global buffer,
    // so we perform transcription and copy the overflow to the global buffer
    BUFFER.set(buffer.subarray(0, remaining), bufferPointer);
    bufferPointer += remaining;

    // Dispatch the audio buffer
    const overflow = buffer.subarray(remaining);
    dispatchForTranscriptionAndResetAudioBuffer(overflow);
    return;
  } else {
    // The buffer is smaller than the remaining space in the global buffer,
    // so we copy it to the global buffer
    BUFFER.set(buffer, bufferPointer);
    bufferPointer += buffer.length;
  }

  if (isSpeech) {
    if (!isRecording) {
      // Indicate start of recording
      self.postMessage({
        type: "status",
        status: "recording_start",
        message: "Listening...",
        duration: "until_next",
      });
    }
    // Start or continue recording
    isRecording = true;
    postSpeechSamples = 0; // Reset the post-speech samples
    return;
  }

  postSpeechSamples += buffer.length;

  // At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
  // So, we check whether we have reached the end of the current audio chunk.
  if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
    // There was a short pause, but not long enough to consider the end of a speech chunk
    // (e.g., the speaker took a breath), so we continue recording
    return;
  }

  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
    // The entire buffer (including the new chunk) is smaller than the minimum
    // duration of a speech chunk, so we can safely discard the buffer.
    resetAfterRecording();
    return;
  }

  dispatchForTranscriptionAndResetAudioBuffer();
};

function greet(text) {
  isPlaying = true;
  const splitter = new TextSplitterStream();
  const stream = tts.stream(splitter, { voice });
  (async () => {
    for await (const { text: chunkText, audio } of stream) {
      self.postMessage({ type: "output", text: chunkText, result: audio });
    }
  })();
  splitter.push(text);
  splitter.close();
  messages.push({ role: "assistant", content: text });
}
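For orientation, a short summary of the main-thread ↔ worker message protocol used above; all message shapes are taken from App.jsx and worker.js in this commit (the comments paraphrase their handlers):

// Main thread → worker
worker.postMessage({ type: "start_call" });                  // greet the user and begin the session
worker.postMessage({ type: "audio", buffer });               // 512-sample Float32Array chunks from vad-processor
worker.postMessage({ type: "set_voice", voice: "af_heart" });
worker.postMessage({ type: "playback_ended" });              // sent when the output worklet's queue drains
worker.postMessage({ type: "end_call" });                    // reset chat history and interrupt generation

// Worker → main thread
// { type: "status", status: "ready" | "recording_start" | "recording_end", ... }
// { type: "output", text, result: audio }   // streamed TTS chunks; the audio is queued for playback
// { type: "info", message }                 // loading/progress messages
// { error }                                 // model-loading or runtime errors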
vite.config.js
ADDED
@@ -0,0 +1,19 @@
import { defineConfig } from "vite";
import react from "@vitejs/plugin-react";
import tailwindcss from "@tailwindcss/vite";

// https://vite.dev/config/
export default defineConfig({
  plugins: [tailwindcss(), react()],
  build: {
    target: "esnext",
  },
  worker: {
    format: "es",
  },
  resolve: {
    // Only bundle a single instance of Transformers.js
    // (shared by `@huggingface/transformers` and `kokoro-js`)
    dedupe: ["@huggingface/transformers"],
  },
});