声音(.wav)->文本(字符串)

文本(字符串)->声音

音频识别

声音的时域和频域表示

时域:位移=f(时间)

频域:(振幅, 相位)=f(频率)

import numpy as np
import numpy.fft as nf
import scipy.io.wavfile as wf
import matplotlib.pyplot as mp

# Load the waveform and normalize 16-bit PCM samples into [-1, 1).
sample_rate, sigs = wf.read('../data/freq.wav')
sigs = sigs / 2 ** 15
# One timestamp per sample for the time-domain axis.
times = np.arange(len(sigs)) / sample_rate
# Frequency bins and FFT magnitudes for the frequency-domain view.
freqs = nf.fftfreq(sigs.size, 1 / sample_rate)
ffts = nf.fft(sigs)
pows = np.abs(ffts)

# --- Time-domain figure ---
mp.figure('Time Domain', facecolor='lightgray')
mp.title('Time Domain', fontsize=20)
mp.xlabel('Time', fontsize=14)
mp.ylabel('Signal', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(times, sigs, c='dodgerblue', label='Signal=f(Time)')
mp.legend()

# --- Frequency-domain figure (non-negative frequencies only) ---
mp.figure('Frequency Domain', facecolor='lightgray')
mp.title('Frequency Domain', fontsize=20)
mp.xlabel('Frequency', fontsize=14)
mp.ylabel('Power', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
nonneg = freqs >= 0
mp.plot(freqs[nonneg], pows[nonneg], c='orangered',
        label='Power=F(Frequency)')
mp.legend()
mp.show()

梅尔频率倒谱系数(MFCC)矩阵

将一段音频样本划分成若干片段,其中每一个片段对应MFCC矩阵中的一行,构成一个子样本。将每个子样本做傅里叶变换得到频率谱线,从中选择与音频内容关系最为紧密的13个特征频率,形成一个特征向量。将多个子样本的特征向量组合成矩阵,即MFCC矩阵。

import numpy as np
import scipy.io.wavfile as wf
import python_speech_features as sf
import matplotlib.pyplot as mp

# Read one training sample and compute its MFCC feature matrix
# (one row per audio frame, 13 cepstral coefficients per row).
sample_rate, sigs = wf.read(
    'D:/ML/data/speeches/training/banana/banana01.wav')
mfcc = sf.mfcc(sigs, sample_rate)

# Transpose so the feature dimension runs along the vertical axis.
mp.matshow(mfcc.T, cmap='gist_rainbow', fignum='MFCC')
mp.title('MFCC', fontsize=20)
mp.xlabel('Sample', fontsize=14)
mp.ylabel('Feature', fontsize=14)
mp.tick_params(which='both', top=False, labeltop=False,
               labelbottom=True, labelsize=10)
mp.show()

语音识别

import os
import warnings
import numpy as np
import scipy.io.wavfile as wf
import python_speech_features as sf
import hmmlearn.hmm as hl
# Silence DeprecationWarning noise emitted by the third-party
# libraries (hmmlearn / python_speech_features) used below.
warnings.filterwarnings('ignore',
	category=DeprecationWarning)
# Suppress numpy floating-point warnings during HMM training.
np.seterr(all='ignore')
def search_speeches(directory, speeches):
	"""Recursively collect .wav file paths under *directory*.

	The final path component of each directory that directly contains
	.wav files is used as the class label. Results are accumulated in
	*speeches*, a dict mapping label -> list of file paths, which is
	mutated in place.

	Raises IOError if *directory* does not exist.
	"""
	directory = os.path.normpath(directory)
	if not os.path.isdir(directory):
		raise IOError("The directory '" +
			directory + "' doesn't exist!")
	# The label is loop-invariant: it only depends on the current
	# directory, so compute it once instead of once per entry.
	label = os.path.basename(directory)
	for entry in os.listdir(directory):
		path = os.path.join(directory, entry)
		if os.path.isdir(path):
			# Descend into subdirectories.
			search_speeches(path, speeches)
		elif os.path.isfile(path) and path.endswith('.wav'):
			# NOTE: the original used '\\' as a line continuation
			# here, which is a syntax error; fixed.
			speeches.setdefault(label, []).append(path)
def _collect_mfccs(speeches):
	"""Turn {label: [wav paths]} into parallel lists (MFCC matrices, labels).

	The MFCC rows of every file sharing a label are stacked vertically
	into one matrix, so each label yields a single training sample.
	"""
	xs, ys = [], []
	for label, filenames in speeches.items():
		mfccs = None
		for filename in filenames:
			sample_rate, sigs = wf.read(filename)
			mfcc = sf.mfcc(sigs, sample_rate)
			mfccs = mfcc if mfccs is None \
				else np.vstack((mfccs, mfcc))
		xs.append(mfccs)
		ys.append(label)
	return xs, ys

train_speeches = {}
search_speeches('../data/speeches/training',
	train_speeches)
train_x, train_y = _collect_mfccs(train_speeches)

# Train one Gaussian-emission HMM per word label.
# (Fixed the 'modles' typo from the original.)
models = {}
for mfccs, label in zip(train_x, train_y):
	model = hl.GaussianHMM(
		n_components=4, covariance_type='diag',
		n_iter=1000)
	models[label] = model.fit(mfccs)

test_speeches = {}
search_speeches('../data/speeches/testing',
	test_speeches)
test_x, test_y = _collect_mfccs(test_speeches)

# Classify each test sample by the model with the highest
# log-likelihood score. The original source was truncated here
# ('score, lab...') and used invalid '\\' continuations; this is the
# standard completion of that loop.
pred_test_y = []
for mfccs in test_x:
	best_score, best_label = None, None
	for label, model in models.items():
		score = model.score(mfccs)
		if best_score is None or best_score < score:
			best_score, best_label = score, label
	pred_test_y.append(best_label)
print(test_y)
print(pred_test_y)

声音合成

根据需求获取某个声音的模型频域数据,根据业务需要可以修改模型数据,逆向生成时域数据,完成声音的合成。

案例:

import json
import numpy as np
import scipy.io.wavfile as wf

# Frequency table: note name (e.g. 'G5') -> fundamental frequency in Hz.
# NOTE(review): assumes the JSON file maps note names to numbers — confirm.
with open('../data/12.json', 'r') as f:
    freqs = json.load(f)

# Melody as (note name, duration in seconds) pairs.
tones = [
    ('G5', 1.5),
    ('A5', 0.5),
    ('G5', 1.5),
    ('E5', 0.5),
    ('D5', 0.5),
    ('E5', 0.25),
    ('D5', 0.25),
    ('C5', 0.5),
    ('A4', 0.5),
    ('C5', 0.75)]
sample_rate = 44100

# Synthesize each tone as a pure sine wave and concatenate.
# BUG FIX: the original seeded the output with np.empty(shape=1), which
# prepended one uninitialized garbage sample to the waveform.
pieces = []
for tone, duration in tones:
    # BUG FIX: np.linspace requires an integer sample count (the float
    # duration * sample_rate raises TypeError on modern numpy).
    # endpoint=False keeps sample spacing uniform across tone boundaries.
    n_samples = int(duration * sample_rate)
    times = np.linspace(0, duration, n_samples, endpoint=False)
    pieces.append(np.sin(2 * np.pi * freqs[tone] * times))
music = np.concatenate(pieces)

# Scale into the int16 range. BUG FIX: 2**15 would map amplitude 1.0 to
# 32768, which wraps to -32768 in int16; 2**15 - 1 stays in range.
music *= 2 ** 15 - 1
music = music.astype(np.int16)
wf.write('../data/music.wav', sample_rate, music)