% 2003-03-18-singing.diary % Diary of training classifiers to detect singing in music % 2003-03-18 dpwe@ee.columbia.edu % We will be using Netlab for this. Make sure its path is loaded % (see http://www.ncrg.aston.ac.uk/netlab/ to download it) path('/Users/dpwe/matlab/netlab/',path); % (for an introduction to netlab, run the "demnlab" demo front-end) % Train a network for singing detection % First, read in our data dd = []; ll = []; for u = 1:60; d = pfread('trn-music-plp12dd.pf', u-1); dd = [dd;d]; l = ilabread('trn-music.ilab', u-1); ll = [ll;l']; end % Set up vector of options for the optimiser. options = zeros(1,18); options(1) = 1; % This provides display of error values. options(9) = 1; % Check the gradient calculations. options(14) = 100; % Number of training cycles. nhid = 10; nout = 1; alpha = 0.2; % We will use 26 inputs of the 39 i.e. direct and deltas, not doubledeltas net = mlp(26, nhid, nout, 'logistic', alpha); % Net training handled via general optimization routine by Netlab... [net] = netopt(net, options, dd(1:5000,1:26), ll(1:5000), 'quasinew'); % See how it performs on the training data y=mlpfwd(net,dd(:,1:26)); subplot(311) plot(y(1:5000)) subplot(312) plot(ll(1:5000)) % Well, it's somewhat related... % Try it on some distinct test data - a single track da = pfread('aimee-plp12dd.pf',0); ya = mlpfwd(net,da(:,1:26)); % Read in the labels (in Timit format) K = labreadkey('musvox.phset'); [T,L] = labreadlab('aimee.lab', K, 1); T(:,2) = T(:,2)+T(:,1); % Convert the time interval definitions into labels for each 16ms frame la = labsamplabs([1:16578]*0.016,T,L)-1; subplot(311) plot(0.9*la(1:2000)) subplot(312) plot(ya(1:2000)) % If we use a threshold of 0.5 as a decision point, what is % our frame accuracy? sum( (ya>0.5) == la')/length(la) % Try smoothing the output to enforce some continuity yas = conv(hanning(51)/sum(hanning(51)),ya); yas = yas(25+[1:16578]); subplot(313) pp = zeros(1, 99); for t = 1:99; pp(t) = sum( (yas>(t/100)) == la')/length(la); end plot(pp) max(pp) find(pp == max(pp)) % best threshold is 0.09, %corr = 82.7% plot(yas(1:2000)) % looks much smoother, of course % % Now try the task with GMMs instead % % Separate the music and singing examples % Music frames have 0 for the target label ddM = dd(find(ll==0),:); % .. and singing frames have it equal to 1 ddS = dd(find(ll==1),:); % Use 20 mixtures for each GMM, same 26-dim data nmix = 20; ndim = 26; % Initialize the models as diagonal-covariance mixtures gmM = gmm(ndim,nmix,'diag'); gmS = gmm(ndim,nmix,'diag'); % Use k-means to get the initial cluster centers options = foptions; options(14) = 5; % Just use 5 iterations of k-means in initialisation gmM = gmminit(gmM, ddM(:,1:26), options); gmS = gmminit(gmS, ddS(:,1:26), options); % Now do the full EM training to optimize the GMM options = zeros(1, 18); options(1) = 1; % Prints out error values. options(14) = 20; % Number of iterations. [gmM, options, errlog] = gmmem(gmM, ddM(:,1:26), options); % .. for both music (gmM) and singing (gmS) models.. options = zeros(1, 18); options(1) = 1; % Prints out error values. options(14) = 20; % Number of iterations. [gmS, options, errlog] = gmmem(gmS, ddS(:,1:26), options); % Calculate the likelihoods of our test frames under both models lMa = gmmprob(gmM, da(:,1:26)); lSa = gmmprob(gmS, da(:,1:26)); % Decision variable is difference of log likelihoods lra = log(lSa) - log(lMa); % Assuming equal priors, the appropriate threshold is 0 sum( (lra > 0) == la')/length(la) % Raw frame accuracy of these GMMs ~ 66% (guessing is 50%) % Search for best threshold, as upper bound on performance pp=zeros(1,99); for t = 1:99; pp(t) = sum( (lra>(t/10 - 5)) == la')/length(la); end plot(pp) max(pp) find(pp==max(pp)) % Best FA is 76% at thr=-2.5 % Try smoothing the frame-level values to reduce classification noise lras = conv(hanning(61)/sum(hanning(61)),lra); lras = lras(30+[1:16578]); sum( (lras > 0) == la')/length(la) % simple threshold on smoothed is also 76% pp=zeros(1,99); for t = 1:99; pp(t) = sum( (lras>(t/10 - 5)) == la')/length(la); end plot(pp) max(pp) find(pp==max(pp)) % Cheating threshold on smoothed lr is -0.8 giving 85% FA upper bound % Display the raw output subplot(312) tt = [0:1999]*0.016; plot(tt, lra(1:2000)) axis([0 32 -10 10]) % Add the threshold hold on; plot([0 32], [0 0], 'g'); hold off % Plot the smoothed output subplot(313) plot(tt, lras(1:2000)) % Add the optimal threshold hold on; plot([0 32], [-0.8 -0.8], 'g'); hold off axis([0 32 -5 5]) % Add rectangles showing target regions rect(T(2:2:6,1), T(2:2:6,1)+T(2:2:6,2), repmat(-4.9, 3, 1), repmat(4.9, 3,1)) subplot(312) rect(T(2:2:6,1), T(2:2:6,1)+T(2:2:6,2), repmat(-4.9, 3, 1), repmat(4.9, 3,1)) % % Compare to using HMM to do the smoothing % % Make a matrix of the log-lhoods from the GMM for 4 states: % entry, singing, no singing, exit llhoods=[zeros(1,16578);log(lMa');log(lSa');zeros(1,16578)]; % Hand-build a transition matrix a = 0.0002; tr = [0 0.5 0.5 0; 0 (1-2*a) a a; 0 a (1-2*a) a; 0 0 0 0]; % Run viterbi dynamic programming to find best path [Q,P] = logvitlh(llhoods,tr); plot(tt, Q(1:2000)-2.5, 'r'); sum( (Q-2) == la)/length(la) % setting tx prob to 1/5000 pushes corr to 74.9% - not a big deal % What happens if we make tx prob 10x smaller? a = 0.00002; tr = [0 0.5 0.5 0; 0 (1-2*a) a a; 0 a (1-2*a) a; 0 0 0 0]; [Q2,P2] = logvitlh(llhoods,tr); hold on; plot(tt, Q2(1:2000)-1.5, 'r'); hold off sum( (Q2-2) == la)/length(la) % Up to 1/50,000 deletes some of the spots, but agreement down to 74.4% sum( (Q2-2) == (Q-2))/length(la) % 95% the same as the original in frames % Compare visually to best smoothed-probs labels lrasQmp8 = (lras > -0.8); plot(tt, lra(1:2000), tt, lras(1:2000),'r') axis([0 32 -10 10]) hold on; plot([0 32], [-0.8 -0.8], 'g'); hold off subplot(413) plot(tt, Q(1:2000), 'c', tt, lrasQmp8(1:2000)+.5,'g') axis([0 32 0 3.6]) rect(T(2:2:6,1), T(2:2:6,1)+T(2:2:6,2), repmat(0.1, 3, 1), repmat(3.5, 3,1)) % end