ESPnet: end-to-end speech processing toolkit
ESPnet is an end-to-end speech processing toolkit covering many speech-related tasks.
Easy to install
pip install espnet
for easy install.See this instruction for more details.
Supports many tasks
We provide a complete setup for various speech processing tasks.
ASR: Automatic Speech Recognition
TTS: Text-to-speech
SE: Speech enhancement (and separation)
SUM: Speech Summarization
SVS: Singing Voice Synthesis
SSL: Self-supervised Learning
UASR: Unsupervised ASR
EURO: ESPnet Unsupervised Recognition - Open-source
S2T: Speech-to-text with Whisper-style multilingual multitask models
More Documents
You can find tutorials on ESPnet packages.
Citations
@inproceedings{watanabe2018espnet,
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
title={{ESPnet}: End-to-End Speech Processing Toolkit},
year={2018},
booktitle={Proceedings of Interspeech},
pages={2207--2211},
doi={10.21437/Interspeech.2018-1456},
url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
}
@inproceedings{hayashi2020espnet,
title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={7654--7658},
year={2020},
organization={IEEE}
}
@inproceedings{inaguma-etal-2020-espnet,
title = "{ESP}net-{ST}: All-in-One Speech Translation Toolkit",
author = "Inaguma, Hirofumi and
Kiyono, Shun and
Duh, Kevin and
Karita, Shigeki and
Yalta, Nelson and
Hayashi, Tomoki and
Watanabe, Shinji",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/2020.acl-demos.34",
pages = "302--311",
}
@article{hayashi2021espnet2,
title={Espnet2-tts: Extending the edge of tts research},
author={Hayashi, Tomoki and Yamamoto, Ryuichi and Yoshimura, Takenori and Wu, Peter and Shi, Jiatong and Saeki, Takaaki and Ju, Yooncheol and Yasuda, Yusuke and Takamichi, Shinnosuke and Watanabe, Shinji},
journal={arXiv preprint arXiv:2110.07840},
year={2021}
}
@inproceedings{li2020espnet,
title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
author={Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph Boeddeker and Zhuo Chen and Shinji Watanabe},
booktitle={Proceedings of IEEE Spoken Language Technology Workshop (SLT)},
pages={785--792},
year={2021},
organization={IEEE},
}
@inproceedings{arora2021espnet,
title={{ESPnet-SLU}: Advancing Spoken Language Understanding through ESPnet},
author={Arora, Siddhant and Dalmia, Siddharth and Denisov, Pavel and Chang, Xuankai and Ueda, Yushi and Peng, Yifan and Zhang, Yuekai and Kumar, Sujay and Ganesan, Karthik and Yan, Brian and others},
booktitle={ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={7167--7171},
year={2022},
organization={IEEE}
}
@inproceedings{shi2022muskits,
author={Shi, Jiatong and Guo, Shuai and Qian, Tao and Huo, Nan and Hayashi, Tomoki and Wu, Yuning and Xu, Frank and Chang, Xuankai and Li, Huazhe and Wu, Peter and Watanabe, Shinji and Jin, Qin},
title={{Muskits}: an End-to-End Music Processing Toolkit for Singing Voice Synthesis},
year={2022},
booktitle={Proceedings of Interspeech},
pages={4277-4281},
url={https://www.isca-speech.org/archive/pdfs/interspeech_2022/shi22d_interspeech.pdf}
}
@inproceedings{lu22c_interspeech,
author={Yen-Ju Lu and Xuankai Chang and Chenda Li and Wangyou Zhang and Samuele Cornell and Zhaoheng Ni and Yoshiki Masuyama and Brian Yan and Robin Scheibler and Zhong-Qiu Wang and Yu Tsao and Yanmin Qian and Shinji Watanabe},
title={{ESPnet-SE : Speech Enhancement for Robust Speech Recognition, Translation, and Understanding}},
year=2022,
booktitle={Proc. Interspeech 2022},
pages={5458--5462},
}
@article{gao2022euro,
title={{EURO}: {ESPnet} Unsupervised ASR Open-source Toolkit},
author={Gao, Dongji and Shi, Jiatong and Chuang, Shun-Po and Garcia, Leibny Paola and Lee, Hung-yi and Watanabe, Shinji and Khudanpur, Sanjeev},
journal={arXiv preprint arXiv:2211.17196},
year={2022}
}
@article{peng2023reproducing,
title={Reproducing Whisper-Style Training Using an Open-Source Toolkit and Publicly Available Data},
author={Peng, Yifan and Tian, Jinchuan and Yan, Brian and Berrebbi, Dan and Chang, Xuankai and Li, Xinjian and Shi, Jiatong and Arora, Siddhant and Chen, William and Sharma, Roshan and others},
journal={arXiv preprint arXiv:2309.13876},
year={2023}
}