-
Notifications
You must be signed in to change notification settings - Fork 7
/
run_seg_asr_filter1.sh
111 lines (90 loc) · 4.78 KB
/
run_seg_asr_filter1.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env bash
# Segment, transcribe and filter audio/video data.
#
# Usage: run_seg_asr_filter1.sh [options] <audio_dir> <output_dir>
#   <audio_dir>   directory holding the original audio (wav) / video (mp4) files
#   <output_dir>  directory that receives the segmented, transcribed and
#                 filtered data
#
# The variables below are assigned before utils/parse_options.sh is sourced;
# presumably (Kaldi convention) each can be overridden on the command line as
# "--name value" -- confirm against utils/parse_options.sh.
gpu_ids=0,1,2,3,4,5,6,7
support_language="zh en"  # languages supported for now
other_format=flv          # extension converted to wav by stage -1
stage=0                   # first stage to run
stop_stage=4              # last stage to run

# Directory containing this script. Every helper is addressed through $ROOT,
# including parse_options.sh, so the option parser does not depend on the
# directory the script is started from.
ROOT="$(cd "$(dirname "$0")" && pwd)" || exit 1
. "$ROOT/utils/parse_options.sh" || exit 1

# Both positional arguments are required; without them every derived path
# below would collapse to something like "/whisper_transcript".
if [ $# -lt 2 ]; then
  echo "Usage: $0 [options] <audio_dir> <output_dir>" >&2
  exit 1
fi

audio_dir=$1   # path to the original audio (wav) / video (mp4) files
output_dir=$2  # path for the segmented, transcribed and filtered data
mkdir -p "$output_dir" || exit 1
echo "# 转写的音频绝对路径为:$audio_dir"
echo "# 最终的保存音频路径为:$output_dir"

result_dir=$output_dir/whisper_transcript  # first-pass transcription results
segment_dir=$output_dir/whisper_segment    # segmented audio files
data_dir=$output_dir/data                  # Kaldi-format data for the segments
data_acc95_dir=$data_dir/delins2           # Kaldi-format data after filtering
# Stage -1 (skipped with the default stage=0): convert files with the
# extension $other_format to wav.
# Writes $output_dir/$other_format.scp with one "<id>\t<path>" line per file,
# sorted, and passes that list to clients/audio/convert2wav.sh.
if [ "$stage" -le -1 ] && [ "$stop_stage" -ge -1 ]; then
  echo "# 第-1步,转换其他音频的格式,需要指定音频格式: other_format=$other_format"
  # The id is the file path with the leading $audio_dir removed and every "/"
  # replaced by "_". The prefix is stripped literally with index()/substr():
  # using the directory as a gsub() regex would interpret "." and other
  # metacharacters in the path (audio_dir="." would erase the whole name) and
  # would also delete later occurrences of the directory string.
  find "$audio_dir" -type f -name "*.$other_format" \
    | awk -v root="$audio_dir" '{
        name = $0
        if (index(name, root) == 1) name = substr(name, length(root) + 1)
        gsub("/", "_", name)
        print name "\t" $0
      }' \
    | sort > "$output_dir/$other_format.scp"
  bash "$ROOT/clients/audio/convert2wav.sh" "$output_dir/$other_format.scp"
fi
# Stage 0: prepare $output_dir/wav.scp, one "<id>\t<path>" line per input file.
# Every regular file under $audio_dir is listed, whatever its extension; to
# restrict the list to wav and mp4 files, add
#   \( -name "*.wav" -o -name "*.mp4" \)
# to the find command.
if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ]; then
  echo "# 第0.0步, 将转写文件调整格式列到wav.scp文件中"
  # The id is the file path with the leading $audio_dir removed and every "/"
  # replaced by "_". The prefix is stripped literally with index()/substr():
  # using the directory as a gsub() regex would interpret "." and other
  # metacharacters in the path (audio_dir="." would erase the whole name) and
  # would also delete later occurrences of the directory string.
  find "$audio_dir" -type f \
    | awk -v root="$audio_dir" '{
        name = $0
        if (index(name, root) == 1) name = substr(name, length(root) + 1)
        gsub("/", "_", name)
        print name "\t" $0
      }' > "$output_dir/wav.scp"
  # Disabled optional steps, kept for reference:
  #   0.1  remove spaces from the audio paths (replace them with "-") and limit
  #        file names to 15 characters:
  #          python3 $ROOT/clients/audio/rm_space_in_path.py $output_dir/wav.scp
  #   0.2  regenerate wav.scp by running the find/awk command above again, so
  #        that it lists the renamed paths.
fi
# Stage 1: whisper transcription, yielding segment timestamps and text.
# Uses WhisperX with the large-v3 model; batch=32 consumes ~13GB of GPU RAM.
# Reads the file list $output_dir/wav.scp and writes into $result_dir (later
# stages read $result_dir/<lang>).
if [ "$stage" -le 1 ] && [ "$stop_stage" -ge 1 ]; then
  echo "# 第1步,使用WhisperX large-v3模型转写。"
  # Paths are quoted so directories containing spaces are passed as one
  # argument each.
  python3 "$ROOT/utils/run_whisperX1.py" \
    -i "$output_dir/wav.scp" \
    -o "$result_dir" \
    -g "$gpu_ids" \
    -b 32
fi
# Stage 2: segment the audio/video with the whisper segments.
# For each supported language, utils/segment_whisper_tsv.py is given the file
# list, the transcription results in $result_dir/<lang>, and the output
# directory $segment_dir/<lang>.
if [ "$stage" -le 2 ] && [ "$stop_stage" -ge 2 ]; then
  echo "# 第2步,根据转写得到的时间戳对音视频进行切分,得到切分后的wav。"
  # $support_language is deliberately unquoted: it is a space-separated list.
  for lang in $support_language; do
    # -n 48: presumably the number of parallel workers -- confirm against
    # segment_whisper_tsv.py.
    python3 "$ROOT/utils/segment_whisper_tsv.py" \
      -i "$output_dir/wav.scp" \
      -t "$result_dir/$lang" \
      -o "$segment_dir/$lang" \
      -n 48
  done
fi
# Stage 3: prepare Kaldi-format data (wav.scp, text, utt2spk, wav2dur,
# spk2utt) from all segments of each language, keeping only segments whose
# duration lies in [0.5, 40] seconds. The unfiltered files are kept in
# $data_dir/<lang>/backup.
if [ "$stage" -le 3 ] && [ "$stop_stage" -ge 3 ]; then
  # $support_language is deliberately unquoted: it is a space-separated list.
  for lang in $support_language; do
    lang_seg=$segment_dir/$lang
    lang_data=$data_dir/$lang
    mkdir -p "$lang_data"
    echo "# 第3步,准备转写所需的kaldi格式数据,至少包含wav.scp和text,过滤掉[0.5, 40]s之外的音频段"

    # wav.scp: "<utt-id>\t<path>", where the utt-id is the file name without
    # ".wav". The dot is matched literally; an unescaped "." would match any
    # character and corrupt ids such as "mywave_01". Every literal ".wav" is
    # removed, as before.
    # NOTE(review): confirm whether only the trailing extension should be
    # stripped to match the ids written to transcription.txt.
    find "$lang_seg" -type f -name "*.wav" \
      | awk -F"/" '{name = $NF; gsub(/\.wav/, "", name); print name "\t" $0}' \
      | sort > "$lang_data/wav.scp"

    # text: concatenation of every transcription.txt below the segment dir.
    find "$lang_seg" -name "transcription.txt" -print0 \
      | xargs -0 cat > "$lang_data/text"

    # utt2spk: "<utt-id>\t<name of the directory containing the segment>".
    find "$lang_seg" -type f -name "*.wav" \
      | awk -F"/" '{name = $NF; gsub(/\.wav/, "", name); print name "\t" $(NF-1)}' \
      | sort > "$lang_data/utt2spk"

    # --nj 48: presumably the number of parallel jobs (Kaldi convention) --
    # confirm against wav_to_duration.sh.
    bash "$ROOT/utils/wav_to_duration.sh" --nj 48 \
      "$lang_data/wav.scp" "$lang_data/wav2dur"

    # Move everything generated so far into backup/. The backup directory
    # itself is skipped; "mv dir/* dir/backup" would try to move it into
    # itself and print an error.
    mkdir -p "$lang_data/backup"
    for entry in "$lang_data"/*; do
      [ "$entry" = "$lang_data/backup" ] && continue
      mv "$entry" "$lang_data/backup/"
    done

    # Drop segments longer than 40s (and shorter than 0.5s): audio whose
    # whisper transcription went wrong gets concatenated and ends up long.
    awk '$2 <= 40 && $2 >= 0.5' "$lang_data/backup/wav2dur" > "$lang_data/wav2dur"

    # Restrict the remaining files to the utterances that survived the
    # duration filter.
    for f in wav.scp text utt2spk; do
      perl "$ROOT/utils/filter_scp.pl" "$lang_data/wav2dur" \
        "$lang_data/backup/$f" > "$lang_data/$f"
    done
    perl "$ROOT/utils/utt2spk_to_spk2utt.pl" "$lang_data/utt2spk" \
      > "$lang_data/spk2utt"
  done
fi
# Stage 4: run utils/infer_paraformer.sh (its stages 1-3) for each supported
# language on $data_dir/<lang>, writing to $data_acc95_dir/<lang>.
# According to the log message below, this transcribes with a Paraformer
# model, computes the CER, and keeps data with fewer than 2 insertion +
# deletion errors and CER <= 30%; the thresholds themselves are not set here.
if [ "$stage" -le 4 ] && [ "$stop_stage" -ge 4 ]; then
  # $support_language is deliberately unquoted: it is a space-separated list.
  for lang in $support_language; do
    mkdir -p "$data_acc95_dir/$lang"
    echo "# 第4步,使用Paraformer模型进行转写,并计算CER,筛选出插入错误+删除错误小于2,且CER<=30%的数据"
    # Paths are quoted so directories containing spaces are passed as one
    # argument each.
    bash "$ROOT/utils/infer_paraformer.sh" \
      --stage 1 --stop_stage 3 \
      --language "$lang" \
      --gpuid_list "$gpu_ids" \
      --batch_size 16 \
      "$data_dir/$lang" "$data_acc95_dir/$lang"
  done
fi