-
Notifications
You must be signed in to change notification settings - Fork 0
/
run.sh
220 lines (192 loc) · 8.92 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# Extra dependencies: install docopt from its source directory, then the
# num2words wheel with pip.
# NOTE(review): pip runs after the cd below, so the relative wheel path is
# resolved inside docopt-0.6.2/ - confirm that is where the .whl lives.
cd docopt-0.6.2
python setup.py install
pip install num2words-0.5.12-py3-none-any.whl
# Activate the environment: switch to the py39 conda env, prepend
# /home/zhhao/fairseq to PYTHONPATH, and change into the train/ directory
# (later relative cd commands start from here).
conda activate py39
export PYTHONPATH=/home/zhhao/fairseq:$PYTHONPATH
cd /home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/train
# Test vicuna-7b: start fastchat.serve.cli on the local vicuna-7b model with
# the --load-8bit option.
python3 -m fastchat.serve.cli --model-path /home/zhhao/llm_model/vicuna-7b --load-8bit
# Data preparation for stage1 and stage2; every command below runs from
# ./preprocess and writes under ../data.
cd ./preprocess

# LibriSpeech preparation, once per --mode (train, then test).
python prep_librispeech_data.py \
  --data-root /home/zhhao/data_source/SLR12/ \
  --tgt-dir ../data/librispeech \
  --mode train
python prep_librispeech_data.py \
  --data-root /home/zhhao/data_source/SLR12/ \
  --tgt-dir ../data/librispeech \
  --mode test

# MuST-C raw preparation for --languages es.
python prep_mustc_raw.py \
  --data-root /home/zhhao/data_source/MUST-C/ \
  --tgt-dir ../data/mustc \
  --languages es

# Filter the generated TSVs with filter_tsv.py (WER threshold 0.0, at most
# 50000 examples, ASR batch size 36), with CUDA_VISIBLE_DEVICES set to "1,".
# NOTE(review): the MuST-C filter reads ../data/mustc/en-de while the step
# above prepared "es" - confirm the intended language pair.
CUDA_VISIBLE_DEVICES=1, python filter_tsv.py \
  --dataset_name 'LIBRISPEECH' \
  --tsv_root ../data/librispeech \
  --asr_batch_size 36 \
  --asr_wer_threshold 0.0 \
  --max_example_number 50000
CUDA_VISIBLE_DEVICES=1, python filter_tsv.py \
  --dataset_name 'MUSTC' \
  --tsv_root ../data/mustc/en-de \
  --asr_batch_size 36 \
  --asr_wer_threshold 0.0 \
  --max_example_number 50000
# MT training: launch train_mt.py through torchrun on the MuST-C en-de data,
# starting from the vicuna-7b model and writing checkpoints to save_path.
# The launch line is written for node rank 3 of a 4-node x 4-process job whose
# master is 192.168.1.35:12345.
save_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/mt/mustc/en-de/run1"
data_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/data/mustc/en-de/"
llm_model="/home/zhhao/llm_model/vicuna-7b"

torchrun \
  --nnodes=4 --nproc_per_node=4 \
  --master_port=12345 --node_rank=3 --master_addr="192.168.1.35" \
  train_mt.py \
  --model_name_or_path "${llm_model}" \
  --data_path "${data_path}" \
  --data_split_train 'train' --data_split_eval 'dev' \
  --output_dir "${save_path}" \
  --num_train_epochs 3 \
  --per_device_train_batch_size 1 --per_device_eval_batch_size 1 \
  --gradient_accumulation_steps 8 \
  --evaluation_strategy "steps" --eval_steps 1000 \
  --save_strategy "steps" --save_steps 1000 --save_total_limit 10 \
  --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --gradient_checkpointing False \
  --seed 1234 \
  --fp16 True \
  --deepspeed ../configs/deepspeed_config.json
# Stage 1 training (activate the environment first): launch stage1.py through
# torchrun on the MuST-C en-es data with the llama2 13b model and the
# wav2vec_vox_960h_pl speech model. The run passes
# --freeze_speech_foundation True, --freeze_backbone True and
# --only_tune_adapter True.
# The launch line is written for node rank 0 of a 4-node x 4-process job whose
# master is 192.168.1.35:12345.
save_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/en-es/stage1/run2"
data_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/data/mustc/en-es/"
#data_path=/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/data/librispeech
ssl_model="/home/zhhao/ssl_model/20230627/wav2vec_vox_960h_pl.pt"
#ssl_model=/home/zhhao/ssl_model/wav2vec_960/libri960_big.pt
llm_model="/home/zhhao/llm_model/llama2/13b"

torchrun \
  --nnodes=4 --nproc_per_node=4 \
  --master_port=12345 --node_rank=0 --master_addr="192.168.1.35" \
  stage1.py \
  --model_name_or_path "${llm_model}" \
  --speech_tower_path "${ssl_model}" \
  --ssl_fintuned True \
  --data_path "${data_path}" \
  --data_split_train 'train' --data_split_eval 'dev' \
  --freeze_speech_foundation True \
  --freeze_backbone True \
  --only_tune_adapter True \
  --output_dir "${save_path}" \
  --num_train_epochs 6 \
  --per_device_train_batch_size 4 --per_device_eval_batch_size 4 \
  --gradient_accumulation_steps 2 \
  --evaluation_strategy "steps" --eval_steps 1000 \
  --save_strategy "steps" --save_steps 1000 --save_total_limit 10 \
  --learning_rate 2e-3 --weight_decay 0. --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --gradient_checkpointing True \
  --seed 1234 \
  --report_to none \
  --fp16 True \
  --deepspeed ../configs/deepspeed_config.json
# Stage 2 training: launch stage2_large.py through torchrun on the MuST-C
# en-fr data. llm_model points at the en-fr stage1 run2 output directory; the
# run passes --freeze_speech_foundation True, --freeze_backbone False and
# --only_tune_adapter False.
# The launch line is written for node rank 3 of a 4-node x 4-process job whose
# master is 192.168.1.35:12345.
# DeepSpeed config names noted by the author:
#   deepspeed_config_stage2_offload.json deepspeed_config_stage3.json
save_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/en-fr/stage2/run2"
data_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/data/mustc/en-fr/"
ssl_model="/home/zhhao/ssl_model/20230627/wav2vec_vox_960h_pl.pt"
#ssl_model=/home/zhhao/ssl_model/wav2vec_960/wav2vec_small.pt
llm_model="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/en-fr/stage1/run2"
#llm_model=/home/zhhao/llm_model/vicuna-7b

torchrun \
  --nnodes=4 --nproc_per_node=4 \
  --master_port=12345 --node_rank=3 --master_addr="192.168.1.35" \
  stage2_large.py \
  --model_name_or_path "${llm_model}" \
  --speech_tower_path "${ssl_model}" \
  --ssl_fintuned True \
  --data_path "${data_path}" \
  --data_split_train 'train' --data_split_eval 'dev' \
  --freeze_speech_foundation True \
  --freeze_backbone False \
  --only_tune_adapter False \
  --output_dir "${save_path}" \
  --num_train_epochs 1 \
  --per_device_train_batch_size 8 --per_device_eval_batch_size 8 \
  --gradient_accumulation_steps 1 \
  --evaluation_strategy "steps" --eval_steps 100 \
  --save_strategy "steps" --save_steps 100 --save_total_limit 10 \
  --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --gradient_checkpointing True \
  --seed 1234 \
  --report_to none \
  --fp16 True \
  --deepspeed ../configs/deepspeed_config_stage3.json
# Single-file test: run generate.py on one test-clean FLAC file with the
# stage1 run6 checkpoint-5000 model (CUDA_VISIBLE_DEVICES set to "1,").
speech_file="/home/zhhao/data_source/SLR12/test-clean/1089/134686/1089-134686-0000.flac"
model_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/stage1/run6/checkpoint-5000"
CUDA_VISIBLE_DEVICES=1, python ./generate.py \
  --model-name "${model_path}" \
  --speech-file "${speech_file}"
# CLI: from ../server, start cli.py on the stage2 run7 checkpoint-2000 model
# with --load-8bit and one test-clean FLAC file as the speech input
# (CUDA_VISIBLE_DEVICES set to "0,").
cd ../server/
model_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/stage2/run7/checkpoint-2000"
speech_file="/home/zhhao/data_source/SLR12/test-clean/1089/134686/1089-134686-0000.flac"
CUDA_VISIBLE_DEVICES=0, python ./cli.py \
  --model-path "${model_path}" \
  --load-8bit \
  --speech-file "${speech_file}"
# Example questions for the CLI session. They are kept as comments because,
# as bare lines, the shell would try to execute them as the commands "How"
# and "what".
#   How many speakers are there in this speech
#   what is the language of this speech
# MuST-C MT evaluation: from ../eval, run eval_mt.py on the tst-COMMON split
# of en-${lang} with the MT run1 checkpoint-1000 model, then score the output
# directory with compute_bleu.py. Results go under the checkpoint directory.
cd ../eval/
lang=de
model_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/mt/mustc/en-de/run1/checkpoint-1000"
#model_path=/home/zhhao/llm_model/llama2/7b-chat
result="${model_path}/en-${lang}/result_beam1"
data_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/data/mustc/en-${lang}"
CUDA_VISIBLE_DEVICES=0, python ./eval_mt.py \
  --model-name "${model_path}" \
  --data-path "${data_path}" \
  --data-split 'tst-COMMON' \
  --result "${result}"
python ./compute_bleu.py "${result}/tst-COMMON"
# mt results (BLEU notes; kept as comments so the shell does not execute them)
# run1: mustc, en-de, 128, 2e-5, epoch 3, based on vicuna 7B
# w/o finetune
#   llama 1,    en-de  7B beam1:
#   vicuna 1.1, en-de  7B beam1:22.98 beam4:24.46  13B beam1:24.32 beam4:25.35
#   vicuna 1.3, en-de  7B beam1:23.35 beam4:24.89  13B beam1:24.62 beam4:25.96
#   vicuna 1.5, en-de  7B beam1:19.91 beam4:19.98  13B beam1:25.62 beam4:26.34 (the 7B numbers look questionable)
# w/ finetune
#   7B, checkpoint-1000 beam1:30.64 beam4:32.22
#       checkpoint-2000 beam1:30.44 beam4:31.87
# ASR evaluation: from ../eval, run test_dataset_asr.py on the librispeech
# test-clean split with the stage1 run15 model, then score the output with
# compute_wer.py. Results go to result_beam4 under the model directory.
cd ../eval/
data_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/data/librispeech"
model_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/stage1/run15"
result="${model_path}/result_beam4"
CUDA_VISIBLE_DEVICES=1, python ./test_dataset_asr.py \
  --model-name "${model_path}" \
  --data-path "${data_path}" \
  --data-split 'test-clean' \
  --result "${result}"
python ./compute_wer.py "${result}/test-clean"
# Speech-translation evaluation on a dataset split (7b variant; the original
# heading also notes "split tsv"): from ../eval, run test_dataset.py with
# --beam 4 on the MuST-C en-de tst-COMMON split using the stage2 run22
# checkpoint-1700 model. Results go to result_beam4 under the checkpoint.
cd ../eval
data_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/data/mustc/en-de"
model_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/stage2/run22/checkpoint-1700"
result="${model_path}/result_beam4"
CUDA_VISIBLE_DEVICES=3, python ./test_dataset.py \
  --model-name "${model_path}" \
  --data-path "${data_path}" \
  --data-split 'tst-COMMON' \
  --result "${result}" \
  --beam 4
# Speech-translation evaluation, 13b / large variant.
# Step 1: call extract_adapter.py twice to write the 'mm_length_adapter' and
#         'mm_mlp_adapter' weights to .bin files inside the checkpoint
#         directory.
# Step 2: from ../eval, run test_dataset_large.py with --beam 4 on the
#         tst-COMMON20_50 split, passing both adapter files, then score the
#         output with compute_bleu.py.
data_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/data/mustc/en-de"
model_path="/home/zhhao/audioST/instruct_speech_llama/instruct_speech_new/checkpoints/stage2/run22/checkpoint-1700"

python ./extract_adapter.py \
  --model_name_or_path "${model_path}" \
  --extracted_name 'mm_length_adapter' \
  --output "${model_path}/length_adapter.bin"
python ./extract_adapter.py \
  --model_name_or_path "${model_path}" \
  --extracted_name 'mm_mlp_adapter' \
  --output "${model_path}/mlp_adapter.bin"

cd ../eval/
result="${model_path}/result_beam4_20_50"
CUDA_VISIBLE_DEVICES=0,1, python ./test_dataset_large.py \
  --model-name "${model_path}" \
  --length-adapter-path "${model_path}/length_adapter.bin" \
  --mlp-adapter-path "${model_path}/mlp_adapter.bin" \
  --data-path "${data_path}" \
  --data-split 'tst-COMMON20_50' \
  --result "${result}" \
  --beam 4
python ./compute_bleu.py "${result}/tst-COMMON20_50"