我有一個由 FastAPI 創建的簡單 WebSocket 專案,如下代碼:
from io import BytesIO

import numpy as np
import soundfile as sf
import uvicorn
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
# ASGI application object; the routes below are registered on it.
app = FastAPI()
# Single-page browser client served at "/". It records the microphone with
# MediaRecorder as 'audio/webm' and sends each 250 ms chunk as a binary
# WebSocket message to ws://localhost:8000/listen, so the server receives
# WebM-encoded chunks rather than raw PCM samples.
html = """
<!DOCTYPE html>
<html>
<body>
<h1>Transcribe Audio With FastAPI</h1>
<p id="status">Connection status will go here</p>
<p id="transcript"></p>
<script>
navigator.mediaDevices.getUserMedia({ audio: { sampleSize: 16, channelCount: 1, sampleRate: 16000 } }).then((stream) => {
if (!MediaRecorder.isTypeSupported('audio/webm'))
return alert('Browser not supported')
const mediaRecorder = new MediaRecorder(stream, {
mimeType: 'audio/webm',
})
const socket = new WebSocket('ws://localhost:8000/listen')
socket.onopen = () => {
document.querySelector('#status').textContent = 'Connected'
console.log({ event: 'onopen' })
mediaRecorder.addEventListener('dataavailable', async (event) => {
if (event.data.size > 0 && socket.readyState == 1) {
socket.send(event.data)
}
})
mediaRecorder.start(250)
}
socket.onclose = () => {
console.log({ event: 'onclose' })
}
socket.onerror = (error) => {
console.log({ event: 'onerror', error })
}
})
</script>
</body>
</html>
"""
@app.get("/")
async def get():
    """Serve the browser client page stored in the module-level ``html`` string."""
    return HTMLResponse(html)
@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Accept a WebSocket connection and print every binary message received.

    The loop runs until the client disconnects or an error occurs.

    Raises:
        Exception: wraps any processing error (the original error is chained
            as ``__cause__``). A normal client disconnect is not an error.
    """
    await websocket.accept()
    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            print(data)
            # Convert data to numpy array
            # rest of the process!
    except WebSocketDisconnect:
        # The peer closed the connection: this is the normal way out of the
        # receive loop, so it must not be reported as a processing failure.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        # Sending a close frame after the client has already disconnected can
        # raise a RuntimeError, so only close connections that are still open.
        if not client_gone:
            await websocket.close()
if __name__ == '__main__':
    # Run the app with uvicorn's default host and port; the client page in
    # ``html`` connects to localhost:8000.
    uvicorn.run(app)
運行專案後,我想將收到的資料轉換為 numpy 陣列。
我試過的:1)
def tensorize(x: bytes) -> np.ndarray:
    """Interpret a bytes-like buffer as a 1-D array of float32 values.

    Args:
        x: raw buffer whose length must be a multiple of 4 bytes.

    Returns:
        A writable float32 array that owns its own copy of the data.

    Raises:
        ValueError: if the buffer length is not a multiple of the float32
            element size.
    """
    # np.frombuffer yields a read-only view over the bytes object; copying
    # gives a writable array and avoids the related warning downstream.
    return np.frombuffer(x, dtype=np.float32).copy()
@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Attempt 1: reinterpret each received message as float32 samples.

    Every binary message is passed to ``tensorize``. The browser client sends
    'audio/webm' chunks of arbitrary length, so ``np.frombuffer`` raises
    ``ValueError`` whenever a chunk length is not a multiple of 4 bytes.

    Raises:
        Exception: wraps any processing error (chained as ``__cause__``).
            A normal client disconnect is not an error.
    """
    print("I'm here websocket_endpoint")
    await websocket.accept()
    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            array = tensorize(data)
    except WebSocketDisconnect:
        # Normal end of the stream: the client closed the connection.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        # Closing after the client has already disconnected can raise, so
        # only close connections that are still open.
        if not client_gone:
            await websocket.close()
引發錯誤:
arr = np.frombuffer(x, dtype=np.float32)
ValueError: buffer size must be a multiple of element size
@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Attempt 2: treat each message as int16 PCM and dump 21 chunks to a WAV.

    Each binary message is reinterpreted as little-endian-native int16
    samples, scaled by 2**-15 to floats, and accumulated. After more than 20
    messages the accumulated samples are written to 'stereo_file1.wav' at
    16 kHz and the loop ends.

    The browser client sends 'audio/webm' chunks, not raw PCM, so the bytes
    reinterpreted here are encoded data and the resulting file is noise.

    Raises:
        Exception: wraps any processing error (chained as ``__cause__``).
            A normal client disconnect is not an error.
    """
    print("I'm here websocket_endpoint")
    await websocket.accept()
    client_gone = False
    try:
        whole = []
        counter = 0
        while True:
            data = await websocket.receive_bytes()
            # count=len(data) // 2 drops a trailing odd byte so frombuffer
            # never fails on an odd-length message.
            data_s16 = np.frombuffer(data, dtype=np.int16, count=len(data) // 2, offset=0)
            float_data = data_s16 * 0.5 ** 15
            whole.append(float_data)
            print(data)
            # Must increment: assigning 1 here would never reach the
            # threshold below, and the file would never be written.
            counter += 1
            if counter > 20:
                samples = np.concatenate(whole)
                sf.write('stereo_file1.wav', samples, 16000, 'PCM_24')
                break
            print(counter)
    except WebSocketDisconnect:
        # Normal end of the stream: the client closed the connection.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        # Closing after the client has already disconnected can raise, so
        # only close connections that are still open.
        if not client_gone:
            await websocket.close()
此示例代碼不會引發任何錯誤,但輸出的音訊檔案不包含任何可辨識的聲音,保存下來的只有雜訊。
- 嘗試使用 librosa & soundfile 讀取位元組 io,但無法識別格式
@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Attempt 3a: decode each received message in memory with librosa.

    Every binary message is wrapped in a ``BytesIO`` and handed to
    ``librosa.load``. The messages are 'audio/webm' chunks sent by the
    browser client.

    Raises:
        Exception: wraps any processing error (chained as ``__cause__``).
            A normal client disconnect is not an error.
    """
    await websocket.accept()
    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            byte_io = BytesIO(data)
            array, sr = librosa.load(byte_io)
    except WebSocketDisconnect:
        # Normal end of the stream: the client closed the connection.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        # Closing after the client has already disconnected can raise, so
        # only close connections that are still open.
        if not client_gone:
            await websocket.close()
@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Attempt 3b: decode each received message in memory with soundfile.

    Every binary message is wrapped in a ``BytesIO`` and handed to
    ``sf.read``. The messages are 'audio/webm' chunks sent by the browser
    client.

    Raises:
        Exception: wraps any processing error (chained as ``__cause__``).
            A normal client disconnect is not an error.
    """
    await websocket.accept()
    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            byte_io = BytesIO(data)
            array, sr = sf.read(byte_io)
    except WebSocketDisconnect:
        # Normal end of the stream: the client closed the connection.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        # Closing after the client has already disconnected can raise, so
        # only close connections that are still open.
        if not client_gone:
            await websocket.close()
引發錯誤:
Exception: Could not process audio: Error opening <_io.BytesIO object at 0x7f12a32cd0d0>: Format not recognised.
'''
**Update 1**
I was able to save each received chunk using the following code, but the audio has to be written to the hard drive first and then loaded back with librosa, which is very slow!
import librosa


@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Update 1: write each received message to disk, then load it with librosa.

    Every binary message overwrites 'audio.wav' in the working directory and
    is then decoded from that path with ``librosa.load``. This works but goes
    through the filesystem for every message.

    Raises:
        Exception: wraps any processing error (chained as ``__cause__``).
            A normal client disconnect is not an error.
    """
    print("我在這里 websocket_endpoint")
    await websocket.accept()
    client_gone = False
    try:
        while True:
            data = await websocket.receive_bytes()
            with open('audio.wav', 'wb') as f:
                f.write(data)
            array, sr = librosa.load("audio.wav")
    except WebSocketDisconnect:
        # Normal end of the stream: the client closed the connection.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        # Closing after the client has already disconnected can raise, so
        # only close connections that are still open.
        if not client_gone:
            await websocket.close()
uj5u.com熱心網友回復:
最後,我在 pytorch/audio 的 GitHub 儲存庫上提了一個類似的問題,並在其中一則評論裡從 moto 那裡得到了答案。最終解決方案如下:
@app.websocket("/listen")
async def websocket_endpoint(websocket: WebSocket):
    """Decode each received message in memory with torchaudio's StreamReader.

    Every binary message is wrapped in a ``BytesIO`` and opened with
    ``torchaudio.io.StreamReader``. One basic audio output stream is
    configured with ``chunk_size`` frames per chunk, and the first output of
    every yielded chunk is concatenated into a single tensor.

    Raises:
        Exception: wraps any processing error (chained as ``__cause__``).
            A normal client disconnect is not an error.
    """
    await websocket.accept()
    client_gone = False
    try:
        chunk_size = 1000
        while True:
            data = await websocket.receive_bytes()
            f = BytesIO(data)
            s = torchaudio.io.StreamReader(f)
            s.add_basic_audio_stream(chunk_size)
            # stream() yields one tuple per step with one tensor per
            # configured output stream; only one stream was added, so
            # index 0 is the decoded audio.
            array = torch.concat([chunk[0] for chunk in s.stream()])
    except WebSocketDisconnect:
        # Normal end of the stream: the client closed the connection.
        client_gone = True
    except Exception as e:
        raise Exception(f'Could not process audio: {e}') from e
    finally:
        # Closing after the client has already disconnected can raise, so
        # only close connections that are still open.
        if not client_gone:
            await websocket.close()
我沒能直接把資料轉換為 numpy 陣列,但如果有人需要,呼叫 array.numpy() 就會回傳 numpy 格式的資料。
PS: 相關庫的版本:
[pip3] numpy==1.23.4
[pip3] torch==1.12.1
[pip3] torchaudio==0.12.1
[conda] numpy 1.23.4 pypi_0 pypi
作業系統:
Ubuntu: 22.04
torchaudio.backend: sox_io
轉載請註明出處,本文鏈接:https://www.uj5u.com/yidong/519654.html
