From 4938765eb3f9808ae4e08a006bb4e88eedc55a86 Mon Sep 17 00:00:00 2001
From: Shivanshu Tyagi
Date: Mon, 4 Oct 2021 23:48:02 +0530
Subject: [PATCH] Merge pull request #20291 from spazewalker:master

speech recognition sample

* speech recognition sample added (initial commit)
* fixed typos, removed plt
* trailing whitespaces removed
* masking removed and using opencv for displaying spectrogram
* description added
* requested changes and add opencl fp16 target
* parenthesis and halide removed
* workaround 3d matrix issue
* handle multi channel audio, support for multiple files at once
* suggested changes, fix whitespaces
---
 samples/dnn/speech_recognition.py | 506 ++++++++++++++++++++++++++++++
 1 file changed, 506 insertions(+)
 create mode 100644 samples/dnn/speech_recognition.py

diff --git a/samples/dnn/speech_recognition.py b/samples/dnn/speech_recognition.py
new file mode 100644
index 0000000000..025607edab
--- /dev/null
+++ b/samples/dnn/speech_recognition.py
@@ -0,0 +1,506 @@
+import numpy as np
+import cv2 as cv
+import argparse
+import os
+import soundfile as sf  # Temporary import to load audio files
+
+'''
+    You can download the converted onnx model from https://drive.google.com/drive/folders/1wLtxyao4ItAg8tt4Sb63zt6qXzhcQoR6?usp=sharing
+    or convert the model yourself.
+
+    You can get the original pre-trained Jasper model from NVIDIA: https://ngc.nvidia.com/catalog/models/nvidia:jasper_pyt_onnx_fp16_amp/files
+    Download and unzip: `$ wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/jasper_pyt_onnx_fp16_amp/versions/20.10.0/zip -O jasper_pyt_onnx_fp16_amp_20.10.0.zip && unzip -o ./jasper_pyt_onnx_fp16_amp_20.10.0.zip && unzip -o ./jasper_pyt_onnx_fp16_amp.zip`
+
+    You can get the script to convert the model here: https://gist.github.com/spazewalker/507f1529e19aea7e8417f6e935851a01
+
+    You can convert the model using the following steps:
+    1. Import onnx and numpy, and load the original model
+    ```
+    import numpy as np
+    import onnx
+    from onnx import numpy_helper
+
+    model = onnx.load("./jasper-onnx/1/model.onnx")
+    ```
+    2. Change the data type of the input layer
+    ```
+    inp = model.graph.input[0]
+    model.graph.input.remove(inp)
+    inp.type.tensor_type.elem_type = 1
+    model.graph.input.insert(0,inp)
+    ```
+    3. Change the data type of the output layer
+    ```
+    out = model.graph.output[0]
+    model.graph.output.remove(out)
+    out.type.tensor_type.elem_type = 1
+    model.graph.output.insert(0,out)
+    ```
+    4. Change the data type of every initializer and cast its values from FP16 to FP32
+    ```
+    for i,init in enumerate(model.graph.initializer):
+        model.graph.initializer.remove(init)
+        init.data_type = 1
+        init.raw_data = np.frombuffer(init.raw_data, count=np.prod(init.dims), dtype=np.float16).astype(np.float32).tobytes()
+        model.graph.initializer.insert(i,init)
+    ```
+    5. Add an extra Reshape node to work around the inconsistent input shapes between the
+       Python and C++ APIs of OpenCV (see https://github.com/opencv/opencv/issues/19091).
+       Make and insert a new node with a 'Reshape' operation and the required initializer:
+    ```
+    tensor = numpy_helper.from_array(np.array([0,64,-1]),name='shape_reshape')
+    model.graph.initializer.insert(0,tensor)
+    node = onnx.helper.make_node(op_type='Reshape',inputs=['input__0','shape_reshape'], outputs=['input_reshaped'], name='reshape__0')
+    model.graph.node.insert(0,node)
+    model.graph.node[1].input[0] = 'input_reshaped'
+    ```
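+    (Optional) Before saving, you can sanity-check the modified graph. This is a minimal
+    sketch that only assumes the `onnx` package imported in step 1:
+    ```
+    # raises onnx.checker.ValidationError if the rewritten graph is inconsistent
+    onnx.checker.check_model(model)
+    ```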
+    6. Finally, save the model
+    ```
+    with open('jasper_dynamic_input_float.onnx','wb') as f:
+        onnx.save_model(model,f)
+    ```
+
+    Original repo: https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechRecognition/Jasper
+'''
+
+class FilterbankFeatures:
+    def __init__(self,
+                 sample_rate=16000, window_size=0.02, window_stride=0.01,
+                 n_fft=512, preemph=0.97, n_filt=64, lowfreq=0,
+                 highfreq=None, log=True, dither=1e-5):
+        '''
+        Initializes the pre-processing class. The default values are the ones used by the
+        Jasper architecture for pre-processing. For more details, refer to the paper:
+        https://arxiv.org/abs/1904.03288
+        '''
+        self.win_length = int(sample_rate * window_size)  # frame size
+        self.hop_length = int(sample_rate * window_stride)  # stride
+        self.n_fft = n_fft or 2 ** np.ceil(np.log2(self.win_length))
+        self.log = log
+        self.dither = dither
+        self.n_filt = n_filt
+        self.preemph = preemph
+        highfreq = highfreq or sample_rate / 2
+        self.window_tensor = np.hanning(self.win_length)
+
+        self.filterbanks = self.mel(sample_rate, self.n_fft, n_mels=n_filt, fmin=lowfreq, fmax=highfreq)
+        self.filterbanks = self.filterbanks.astype(np.float32)
+        self.filterbanks = np.expand_dims(self.filterbanks, 0)
+
+    def normalize_batch(self, x, seq_len):
+        '''
+        Normalizes the features.
+        '''
+        x_mean = np.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype)
+        x_std = np.zeros((seq_len.shape[0], x.shape[1]), dtype=x.dtype)
+        for i in range(x.shape[0]):
+            x_mean[i, :] = np.mean(x[i, :, :seq_len[i]], axis=1)
+            x_std[i, :] = np.std(x[i, :, :seq_len[i]], axis=1)
+        # make sure x_std is not zero
+        x_std += 1e-10
+        return (x - np.expand_dims(x_mean, 2)) / np.expand_dims(x_std, 2)
+
+    def calculate_features(self, x, seq_len):
+        '''
+        Calculates the filterbank features.
+        args:
+            x : mono-channel audio
+            seq_len : length of the audio sample
+        returns:
+            x : filterbank features
+        '''
+        dtype = x.dtype
+
+        seq_len = np.ceil(seq_len / self.hop_length)
+        seq_len = np.array(seq_len, dtype=np.int32)
+
+        # dither
+        if self.dither > 0:
+            x += self.dither * np.random.randn(*x.shape)
+
+        # apply pre-emphasis
+        if self.preemph is not None:
+            x = np.concatenate(
+                (np.expand_dims(x[0], -1), x[1:] - self.preemph * x[:-1]), axis=0)
+
+        # Short-Time Fourier Transform
+        x = self.stft(x, n_fft=self.n_fft, hop_length=self.hop_length,
+                      win_length=self.win_length,
+                      fft_window=self.window_tensor)
+
+        # get the power spectrum
+        x = (x**2).sum(-1)
+
+        # dot with the filterbank energies
+        x = np.matmul(np.array(self.filterbanks, dtype=x.dtype), x)
+
+        # log features if required
+        if self.log:
+            x = np.log(x + 1e-20)
+
+        # normalize if required
+        x = self.normalize_batch(x, seq_len).astype(dtype)
+        return x
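+    # Usage sketch (illustrative only): for 1 second of 16 kHz mono audio,
+    #   x = np.random.randn(16000).astype(np.float32)
+    #   feats = FilterbankFeatures().calculate_features(x, seq_len=np.array([16000]))
+    # yields feats of shape (1, 64, 101): 64 mel bands over 1 + 16000 // 160 = 101 frames.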
+    # Mel frequency calculation
+    def hz_to_mel(self, frequencies):
+        '''
+        Converts frequencies from Hz to the mel scale. The input can be a number or a vector.
+        '''
+        frequencies = np.asanyarray(frequencies)
+
+        f_min = 0.0
+        f_sp = 200.0 / 3
+
+        mels = (frequencies - f_min) / f_sp
+
+        # Fill in the log-scale part
+        min_log_hz = 1000.0  # beginning of the log region (Hz)
+        min_log_mel = (min_log_hz - f_min) / f_sp  # same (mels)
+        logstep = np.log(6.4) / 27.0  # step size for the log region
+
+        if frequencies.ndim:
+            # If we have array data, vectorize
+            log_t = frequencies >= min_log_hz
+            mels[log_t] = min_log_mel + np.log(frequencies[log_t] / min_log_hz) / logstep
+        elif frequencies >= min_log_hz:
+            # If we have scalar data, compute directly
+            mels = min_log_mel + np.log(frequencies / min_log_hz) / logstep
+        return mels
+
+    def mel_to_hz(self, mels):
+        '''
+        Converts frequencies from the mel scale to Hz. The input can be a number or a vector.
+        '''
+        mels = np.asanyarray(mels)
+
+        # Fill in the linear scale
+        f_min = 0.0
+        f_sp = 200.0 / 3
+        freqs = f_min + f_sp * mels
+
+        # And now the nonlinear scale
+        min_log_hz = 1000.0  # beginning of the log region (Hz)
+        min_log_mel = (min_log_hz - f_min) / f_sp  # same (mels)
+        logstep = np.log(6.4) / 27.0  # step size for the log region
+
+        if mels.ndim:
+            # If we have vector data, vectorize
+            log_t = mels >= min_log_mel
+            freqs[log_t] = min_log_hz * np.exp(logstep * (mels[log_t] - min_log_mel))
+        elif mels >= min_log_mel:
+            # If we have scalar data, check directly
+            freqs = min_log_hz * np.exp(logstep * (mels - min_log_mel))
+
+        return freqs
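+    # Worked example (Slaney-style mel scale): below 1 kHz the mapping is linear, so
+    # hz_to_mel(1000.0) -> (1000 - 0) / (200 / 3) = 15.0 mels; above 1 kHz it is logarithmic,
+    # so hz_to_mel(8000.0) -> 15 + log(8000 / 1000) / (log(6.4) / 27) ~= 45.2 mels.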
+    def mel_frequencies(self, n_mels=128, fmin=0.0, fmax=11025.0):
+        '''
+        Calculates n mel frequencies between two frequencies
+        args:
+            n_mels : number of bands
+            fmin : minimum frequency
+            fmax : maximum frequency
+        returns:
+            mels : vector of mel frequencies
+        '''
+        # 'Center freqs' of mel bands - uniformly spaced between limits
+        min_mel = self.hz_to_mel(fmin)
+        max_mel = self.hz_to_mel(fmax)
+
+        mels = np.linspace(min_mel, max_mel, n_mels)
+
+        return self.mel_to_hz(mels)
+
+    def mel(self, sr, n_fft, n_mels=128, fmin=0.0, fmax=None, dtype=np.float32):
+        '''
+        Generates the mel filterbank
+        args:
+            sr : sampling rate
+            n_fft : number of FFT components
+            n_mels : number of mel bands to generate
+            fmin : lowest frequency (in Hz)
+            fmax : highest frequency (in Hz). sr/2.0 if None
+            dtype : the data type of the output basis
+        returns:
+            mels : mel transform matrix
+        '''
+        # default max freq = half of the sampling rate
+        if fmax is None:
+            fmax = float(sr) / 2
+
+        # Initialize the weights
+        n_mels = int(n_mels)
+        weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
+
+        # Center freqs of each FFT bin
+        fftfreqs = np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True)
+
+        # 'Center freqs' of mel bands - uniformly spaced between limits
+        mel_f = self.mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax)
+
+        fdiff = np.diff(mel_f)
+        ramps = np.subtract.outer(mel_f, fftfreqs)
+
+        for i in range(n_mels):
+            # lower and upper slopes for all bins
+            lower = -ramps[i] / fdiff[i]
+            upper = ramps[i + 2] / fdiff[i + 1]
+
+            # .. then intersect them with each other and zero
+            weights[i] = np.maximum(0, np.minimum(lower, upper))
+
+        # Use Slaney-style mel, which is scaled to be approximately constant energy per channel
+        enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
+        weights *= enorm[:, np.newaxis]
+        return weights
+
+    # STFT preparation
+    def pad_window_center(self, data, size, axis=-1, **kwargs):
+        '''
+        Centers the data and pads it.
+        args:
+            data : vector to be padded and centered
+            size : length to pad data to
+            axis : axis along which to pad and center the data
+            kwargs : arguments passed to np.pad
+        return : centered and padded data
+        '''
+        kwargs.setdefault("mode", "constant")
+        n = data.shape[axis]
+        lpad = int((size - n) // 2)
+        lengths = [(0, 0)] * data.ndim
+        lengths[axis] = (lpad, int(size - n - lpad))
+        if lpad < 0:
+            raise Exception(
+                ("Target size ({:d}) must be at least input size ({:d})").format(size, n)
+            )
+        return np.pad(data, lengths, **kwargs)
+
+    def frame(self, x, frame_length, hop_length):
+        '''
+        Slices a data array into (overlapping) frames.
+        args:
+            x : array to frame
+            frame_length : length of each frame
+            hop_length : number of steps to advance between frames
+        return : a framed view of `x`
+        '''
+        if x.shape[-1] < frame_length:
+            raise Exception(
+                "Input is too short (n={:d})"
+                " for frame_length={:d}".format(x.shape[-1], frame_length)
+            )
+        x = np.asfortranarray(x)
+        n_frames = 1 + (x.shape[-1] - frame_length) // hop_length
+        strides = np.asarray(x.strides)
+        new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
+        shape = list(x.shape)[:-1] + [frame_length, n_frames]
+        strides = list(strides) + [hop_length * new_stride]
+        return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)
+
+    def dtype_r2c(self, d, default=np.complex64):
+        '''
+        Finds the complex numpy dtype corresponding to a real dtype.
+        args:
+            d : the real-valued dtype to convert to complex
+            default : the default complex target type, if `d` does not match a known dtype
+        return : the complex dtype
+        '''
+        mapping = {
+            np.dtype(np.float32): np.complex64,
+            np.dtype(np.float64): np.complex128,
+        }
+        dt = np.dtype(d)
+        if dt.kind == "c":
+            return dt
+        return np.dtype(mapping.get(dt, default))
+
+    def stft(self, y, n_fft, hop_length=None, win_length=None, fft_window=None, pad_mode='reflect', return_complex=False):
+        '''
+        Short-Time Fourier Transform. The STFT represents a signal in the time-frequency
+        domain by computing discrete Fourier transforms (DFT) over short overlapping windows.
+        args:
+            y : input signal
+            n_fft : length of the windowed signal after padding with zeros
+            hop_length : number of audio samples between adjacent STFT columns
+            win_length : each frame of audio is windowed by a window of length win_length and
+                         then padded with zeros to match n_fft
+            fft_window : a vector or array of length `n_fft` having values computed by a
+                         window function
+            pad_mode : mode used while padding the signal
+            return_complex : returns an array with a complex data type if `True`
+        return : matrix of short-term Fourier transform coefficients
+        '''
+        if win_length is None:
+            win_length = n_fft
+        if hop_length is None:
+            hop_length = int(win_length // 4)
+        if y.ndim != 1:
+            raise Exception(f'Invalid input shape. Only mono-channel audio is supported. Input must have shape (audio,). Got {y.shape}')
+
+        # Pad the window out to n_fft size
+        fft_window = self.pad_window_center(fft_window, n_fft)
+
+        # Reshape so that the window can be broadcast
+        fft_window = fft_window.reshape((-1, 1))
+
+        # Pad the time series so that the frames are centered
+        y = np.pad(y, int(n_fft // 2), mode=pad_mode)
+
+        # Window the time series
+        y_frames = self.frame(y, frame_length=n_fft, hop_length=hop_length)
+
+        # Choose the matching complex dtype
+        dtype = self.dtype_r2c(y.dtype)
+
+        # Compute the STFT matrix (rfft keeps only the non-negative frequency bins)
+        stft_matrix = np.fft.rfft(fft_window * y_frames, axis=0).astype(dtype)
+
+        return stft_matrix if return_complex else np.stack((stft_matrix.real, stft_matrix.imag), axis=-1)
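+# Shape check (illustrative): inside calculate_features(), stft() is called with n_fft=512,
+# hop_length=160, win_length=320 and a Hann window; on a 16000-sample signal it returns an
+# array of shape (1 + 512 // 2, 1 + 16000 // 160, 2) = (257, 101, 2), i.e. frequency bins x
+# frames x (real, imag), since return_complex is False.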
+class Decoder:
+    '''
+    Used for decoding the output of the Jasper model.
+    '''
+    def __init__(self):
+        labels = [' ','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',"'"]
+        self.labels_map = {i: label for i, label in enumerate(labels)}
+        self.blank_id = 28
+
+    def decode(self, x):
+        '''
+        Takes the output of the Jasper model and performs the CTC decoding algorithm to
+        remove duplicates and the special blank symbol. Returns the prediction.
+        '''
+        x = np.argmax(x, axis=-1)
+        hypotheses = []
+        prediction = x.tolist()
+        # CTC decoding procedure
+        decoded_prediction = []
+        previous = self.blank_id
+        for p in prediction:
+            if (p != previous or previous == self.blank_id) and p != self.blank_id:
+                decoded_prediction.append(p)
+            previous = p
+        hypothesis = ''.join([self.labels_map[c] for c in decoded_prediction])
+        hypotheses.append(hypothesis)
+        return hypotheses
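+# Example (illustrative): decode() collapses repeated labels and drops the blank id (28);
+# e.g. the frame-wise argmax sequence [8, 8, 28, 5, 5, 12, 12, 28, 12, 15] decodes to 'hello'.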
+def predict(features, net, decoder):
+    '''
+    Passes the features through the Jasper model and decodes the output to English transcripts.
+    args:
+        features : input features, calculated using the FilterbankFeatures class
+        net : Jasper model dnn.Net object
+        decoder : Decoder object
+    return : predicted text
+    '''
+    # This is a workaround for https://github.com/opencv/opencv/issues/19091
+    # Adding an extra dimension allows us to pass the input to the network
+    # from Python. This should be resolved in the future.
+    features = np.expand_dims(features, axis=3)
+
+    # make the prediction
+    net.setInput(features)
+    output = net.forward()
+
+    # decode the output to a transcript
+    prediction = decoder.decode(output.squeeze(0))
+    return prediction[0]
+
+if __name__ == '__main__':
+
+    # Computation backends supported by the layers
+    backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
+    # Target devices for the computation
+    targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16)
+
+    parser = argparse.ArgumentParser(description='This script runs the Jasper speech recognition model',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--input_audio', type=str, required=True, help='Path to an input audio file, OR path to a txt file listing the relative paths of multiple audio files, one per line')
+    parser.add_argument('--show_spectrogram', action='store_true', help='Whether to show a spectrogram of the input audio.')
+    parser.add_argument('--model', type=str, default='jasper.onnx', help='Path to the onnx file of Jasper. default="jasper.onnx"')
+    parser.add_argument('--output', type=str, help='Path to the file where the recognized transcript must be saved. Leave this empty to print to the console.')
+    parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
+                        help='Select a computation backend: '
+                             "%d: automatically (by default), "
+                             "%d: OpenVINO Inference Engine, "
+                             "%d: OpenCV implementation" % backends)
+    parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
+                        help='Select a target device: '
+                             "%d: CPU target (by default), "
+                             "%d: OpenCL, "
+                             "%d: OpenCL FP16" % targets)
+
+    args, _ = parser.parse_known_args()
+
+    if args.input_audio and not os.path.isfile(args.input_audio):
+        raise OSError("Input audio file does not exist")
+    if not os.path.isfile(args.model):
+        raise OSError("Jasper model file does not exist")
+    if args.input_audio.endswith('.txt'):
+        with open(args.input_audio) as f:
+            content = f.readlines()
+        audio_file_paths = [x.strip() for x in content]
+        for audio_file_path in audio_file_paths:
+            if not os.path.isfile(audio_file_path):
+                raise OSError(f"Audio file ({audio_file_path}) does not exist")
+    else:
+        audio_file_paths = [args.input_audio]
+    audio_file_paths = [os.path.abspath(x) for x in audio_file_paths]
+
+    # Read the audio files
+    features = []
+    try:
+        for audio_file_path in audio_file_paths:
+            audio = sf.read(audio_file_path)
+            # If the audio is stereo, just take the first channel.
+            X = audio[0] if audio[0].ndim == 1 else audio[0][:, 0]
+            features.append(X)
+    except Exception:
+        raise Exception(f"Soundfile cannot read {args.input_audio}. Try a different format")
+
+    # Get the filterbank features
+    feature_extractor = FilterbankFeatures()
+    for i in range(len(features)):
+        X = features[i]
+        seq_len = np.array([X.shape[0]], dtype=np.int32)
+        features[i] = feature_extractor.calculate_features(x=X, seq_len=seq_len)
+
+    # Load the network
+    net = cv.dnn.readNetFromONNX(args.model)
+    net.setPreferableBackend(args.backend)
+    net.setPreferableTarget(args.target)
+
+    # Show the spectrogram if required
+    if args.show_spectrogram and not args.input_audio.endswith('.txt'):
+        img = cv.normalize(src=features[0][0], dst=None, alpha=0, beta=255, norm_type=cv.NORM_MINMAX, dtype=cv.CV_8U)
+        img = cv.applyColorMap(img, cv.COLORMAP_JET)
+        cv.imshow('spectrogram', img)
+        cv.waitKey(0)
+
+    # Initialize the decoder
+    decoder = Decoder()
+
+    # Make the predictions
+    prediction = []
+    print("Predicting...")
+    for feature in features:
+        print(f"\rAudio file {len(prediction) + 1}/{len(features)}", end='')
+        prediction.append(predict(feature, net, decoder))
+    print("")
+
+    # save the transcript if required
+    if args.output:
+        with open(args.output, 'w') as f:
+            for pred in prediction:
+                f.write(pred + '\n')
+        print("Transcript was written to {}".format(args.output))
+    else:
+        print(prediction)
+    cv.destroyAllWindows()