Comments (14)
We have provided a gRPC example in runtime/, or you can wrap your own API
from cosyvoice.
Try this example:
Install fastapi first: pip install fastapi
, then run: fastapi dev --port 3001
"""Minimal FastAPI wrapper around CosyVoice SFT inference.

Run with: fastapi dev --port 3001
"""
import io
import time

from fastapi import FastAPI, Response
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Load the SFT model once at startup; every request below reuses it.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
# sft usage: print the speaker names accepted by inference_sft()
print(cosyvoice.list_avaliable_spks())

app = FastAPI()


@app.get("/api/voice/tts")
async def tts(query: str):
    """Synthesize `query` with the built-in '中文女' speaker and return WAV bytes."""
    # perf_counter measures wall-clock latency; process_time counts only CPU
    # time and would exclude time spent blocked on GPU/IO, under-reporting
    # the real request latency.
    start = time.perf_counter()
    output = cosyvoice.inference_sft(query, '中文女')
    end = time.perf_counter()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    # CosyVoice synthesizes 22.05 kHz audio.
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(content=buffer.read(), media_type="audio/wav")


@app.get("/")
async def root():
    """Liveness check."""
    return {"message": "Hello World"}
from cosyvoice.
Try this example:
Install fastapi first:
pip install fastapi
, then run: fastapi dev --port 3001
# Same FastAPI example, restored from the collapsed one-line paste into
# valid multi-line Python.
import io, time
from fastapi import FastAPI, Response
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Load the SFT model once at startup.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
# sft usage
print(cosyvoice.list_avaliable_spks())

app = FastAPI()


@app.get("/api/voice/tts")
async def tts(query: str):
    """Synthesize `query` with the '中文女' speaker and return WAV bytes."""
    start = time.process_time()
    output = cosyvoice.inference_sft(query, '中文女')
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(content=buffer.read(-1), media_type="audio/wav")


@app.get("/")
async def root():
    return {"message": "Hello World"}
we will be happy if you can make a fastapi pr
from cosyvoice.
Try this example:
Install fastapi first: pip install fastapi
, then run: fastapi dev --port 3001
import io, time
from fastapi import FastAPI, Response
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
# sft usage
print(cosyvoice.list_avaliable_spks())

app = FastAPI()


@app.get("/api/voice/tts")
async def tts(query: str):
    start = time.process_time()
    output = cosyvoice.inference_sft(query, '中文女')
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(content=buffer.read(-1), media_type="audio/wav")


@app.get("/")
async def root():
    return {"message": "Hello World"}

We will be happy if you can make a fastapi PR.
Ok, let me create PR.
from cosyvoice.
thank you
from cosyvoice.
@iflamed why not gradio_client ...
from cosyvoice.
能制作个API吗
from cosyvoice.
同求,看官方提供的grpc api使用有些复杂
from cosyvoice.
Try this example:
Install fastapi first:
pip install fastapi
, then run: fastapi dev --port 3001
# Same FastAPI example, restored from the collapsed one-line paste into
# valid multi-line Python.
import io, time
from fastapi import FastAPI, Response
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Load the SFT model once at startup.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-SFT')
# sft usage
print(cosyvoice.list_avaliable_spks())

app = FastAPI()


@app.get("/api/voice/tts")
async def tts(query: str):
    """Synthesize `query` with the '中文女' speaker and return WAV bytes."""
    start = time.process_time()
    output = cosyvoice.inference_sft(query, '中文女')
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(content=buffer.read(-1), media_type="audio/wav")


@app.get("/")
async def root():
    return {"message": "Hello World"}
如果是inference_zero_shot需要上传音频的需要怎么封装呢
from cosyvoice.
from cosyvoice.
已封装,供参考(代码说明可以参考 https://blog.csdn.net/weixin_42357472/article/details/140321056):
api.py
"""Flask wrapper exposing CosyVoice inference modes as HTTP endpoints.

Each endpoint accepts a JSON POST body and returns the synthesized speech
as audio/wav bytes.
"""
import time
import io, os, sys

# Make the repo root and the bundled Matcha-TTS third-party code importable
# when this file is run as a script.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}'.format(ROOT_DIR))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

import numpy as np
from flask import Flask, request, Response
import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

# Model choice: use CosyVoice-300M for zero-shot / cross-lingual inference,
# CosyVoice-300M-SFT for SFT inference, and CosyVoice-300M-Instruct for
# instruct inference.
cosyvoice = CosyVoice('/data/pretrained_models/CosyVoice-300M')
print(cosyvoice.list_avaliable_spks())

app = Flask(__name__)


def _wav_response(output):
    """Serialize a CosyVoice inference result dict to an audio/wav response."""
    buffer = io.BytesIO()
    # CosyVoice synthesizes 22.05 kHz audio.
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")


def _load_prompt_speech(path):
    """Load a 16 kHz prompt waveform from `path` as a (1, T) float tensor.

    NOTE(review): the int16 round-trip below quantizes the float waveform and
    looks redundant (load_wav already returns a float tensor); it is kept
    verbatim to preserve the original behavior exactly — confirm before
    simplifying.
    """
    prompt_speech = load_wav(path, 16000)
    prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
    prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
    return prompt_speech_16k.float() / (2**15)


@app.route("/inference/sft", methods=['POST'])
def sft():
    """SFT inference: synthesize `query` with the named built-in `speaker`."""
    question_data = request.get_json()
    query = question_data.get('query')
    speaker = question_data.get('speaker')
    if not query:
        return {"error": "Query parameter 'query' is required"}, 400
    start = time.process_time()
    output = cosyvoice.inference_sft(query, speaker)
    end = time.process_time()
    print("infer time:", end - start)
    return _wav_response(output)


@app.route("/inference/zero_shot", methods=['POST'])
def zero_shot():
    """Zero-shot cloning: speak `query` in the voice of the `prompt_speech`
    file, using `prompt_text` as its transcript."""
    question_data = request.get_json()
    tts_text = question_data.get('query')
    prompt_text = question_data.get('prompt_text')
    prompt_speech_16k = _load_prompt_speech(question_data.get('prompt_speech'))
    start = time.process_time()
    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    end = time.process_time()
    print("infer time:", end - start)
    return _wav_response(output)


@app.route("/inference/cross_lingual", methods=['POST'])
def cross_lingual():
    """Cross-lingual cloning: speak `query` in the voice of the
    `prompt_speech` file (no transcript needed)."""
    question_data = request.get_json()
    tts_text = question_data.get('query')
    prompt_speech_16k = _load_prompt_speech(question_data.get('prompt_speech'))
    start = time.process_time()
    output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    end = time.process_time()
    print("infer time:", end - start)
    return _wav_response(output)


@app.route("/inference/instruct", methods=['POST'])
def instruct():
    """Instruct inference: synthesize `query` with `speaker`, steered by the
    natural-language `instruct_text`."""
    question_data = request.get_json()
    tts_text = question_data.get('query')
    speaker = question_data.get('speaker')
    instruct_text = question_data.get('instruct_text')
    start = time.process_time()
    output = cosyvoice.inference_instruct(tts_text, speaker, instruct_text)
    end = time.process_time()
    print("infer time:", end - start)
    return _wav_response(output)


if __name__ == "__main__":
    app.run(host='0.0.0.0', port=50000)
from cosyvoice.
已封装,供参考,代码说明可以参考 https://blog.csdn.net/weixin_42357472/article/details/140321056
api.py
# Same Flask api.py, restored from the collapsed one-line paste into valid
# multi-line Python.
import time
import io, os, sys

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}'.format(ROOT_DIR))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

import numpy as np
from flask import Flask, request, Response
import torch
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

# Model choice: CosyVoice-300M for zero-shot / cross-lingual inference,
# CosyVoice-300M-SFT for SFT inference, CosyVoice-300M-Instruct for
# instruct inference.
cosyvoice = CosyVoice('/data/pretrained_models/CosyVoice-300M')
print(cosyvoice.list_avaliable_spks())

app = Flask(__name__)


@app.route("/inference/sft", methods=['POST'])
def sft():
    question_data = request.get_json()
    query = question_data.get('query')
    speaker = question_data.get('speaker')
    if not query:
        return {"error": "Query parameter 'query' is required"}, 400
    start = time.process_time()
    output = cosyvoice.inference_sft(query, speaker)
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")


@app.route("/inference/zero_shot", methods=['POST'])
def zero_shot():
    question_data = request.get_json()
    tts_text = question_data.get('query')
    prompt_text = question_data.get('prompt_text')
    prompt_speech = load_wav(question_data.get('prompt_speech'), 16000)
    prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
    prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
    start = time.process_time()
    output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")


@app.route("/inference/cross_lingual", methods=['POST'])
def cross_lingual():
    question_data = request.get_json()
    tts_text = question_data.get('query')
    prompt_speech = load_wav(question_data.get('prompt_speech'), 16000)
    prompt_audio = (prompt_speech.numpy() * (2**15)).astype(np.int16).tobytes()
    prompt_speech_16k = torch.from_numpy(np.array(np.frombuffer(prompt_audio, dtype=np.int16))).unsqueeze(dim=0)
    prompt_speech_16k = prompt_speech_16k.float() / (2**15)
    start = time.process_time()
    output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")


@app.route("/inference/instruct", methods=['POST'])
def instruct():
    question_data = request.get_json()
    tts_text = question_data.get('query')
    speaker = question_data.get('speaker')
    instruct_text = question_data.get('instruct_text')
    start = time.process_time()
    output = cosyvoice.inference_instruct(tts_text, speaker, instruct_text)
    end = time.process_time()
    print("infer time:", end - start)
    buffer = io.BytesIO()
    torchaudio.save(buffer, output['tts_speech'], 22050, format="wav")
    buffer.seek(0)
    return Response(buffer.read(), mimetype="audio/wav")


if __name__ == "__main__":
    app.run(host='0.0.0.0', port=50000,)
from cosyvoice.
请问该api支持流式输出吗
from cosyvoice.
from cosyvoice.
Related Issues (20)
- 大家有没有发现文本中使用了中文问号后,问号后面的文本就直接不合成了 HOT 2
- There is no such entity as cosyvoice.utils.common.ras_sampling HOT 13
- docker run fastapi can't start HOT 2
- 【api】希望流式效果可以提供个允许直接调用的API出来 HOT 2
- M2mac无cuda,更新后生成提示错误,但生成完可自动播放 HOT 1
- 非流式输出为何要用yield? HOT 2
- 是否支持训练新的方言,比如**的闽南语? HOT 1
- 如何让模型按照英文大写顺序读 HOT 1
- TransformerLM.__init__() missing 1 required positional argument: 'sampling' HOT 2
- 微调 HOT 1
- Some questions about flow HOT 2
- 分享一下pypi包,一键运行cosyvoice HOT 2
- 代码更新到最新版本之后报错 HOT 2
- 依照安装说明安装后尝试开启cosy voice-instruct webui发生报错 HOT 2
- 如何提升日语tts效果啊? HOT 2
- 流式推理总感觉有爆破音如何解决 HOT 2
- Use Uyghur converted to Latin to train the LLM model HOT 1
- fade_in_out error when stream=True HOT 1
- Add a new language but the result is a meaningless audio. HOT 2
- 多音字问题, "嘎"字无法发出 gá 这个声音
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from cosyvoice.