% 2003-03-18-singing.diary
% Diary of training classifiers to detect singing in music
% 2003-03-18 dpwe@ee.columbia.edu

% We will be using Netlab for this.  Make sure its path is loaded
% (see http://www.ncrg.aston.ac.uk/netlab/ to download it)
path('/Users/dpwe/matlab/netlab/',path);
% (for an introduction to netlab, run the "demnlab" demo front-end)

% Train a network for singing detection

% First, read in our data
dd = []; ll = [];
for u = 1:60; d = pfread('trn-music-plp12dd.pf', u-1); dd = [dd;d]; l = ilabread('trn-music.ilab', u-1); ll = [ll;l']; end

% Set up vector of options for the optimiser.
options = zeros(1,18);
options(1) = 1;                 % This provides display of error values.
options(9) = 1;                 % Check the gradient calculations.
options(14) = 100;              % Number of training cycles.

nhid = 10;
nout = 1;
alpha = 0.2;
% We will use 26 inputs of the 39 i.e. direct and deltas, not doubledeltas
net = mlp(26, nhid, nout, 'logistic', alpha);
% Net training handled via general optimization routine by Netlab...
[net] = netopt(net, options, dd(1:5000,1:26), ll(1:5000), 'quasinew');

% See how it performs on the training data
y=mlpfwd(net,dd(:,1:26));
subplot(311)
plot(y(1:5000))
subplot(312)
plot(ll(1:5000))
% Well, it's somewhat related...

% Try it on some distinct test data - a single track
da = pfread('aimee-plp12dd.pf',0);
ya = mlpfwd(net,da(:,1:26));

% Read in the labels (in Timit format)
K = labreadkey('musvox.phset');
[T,L] = labreadlab('aimee.lab', K, 1);
T(:,2) = T(:,2)+T(:,1);
% Convert the time interval definitions into labels for each 16ms frame
la = labsamplabs([1:16578]*0.016,T,L)-1;
subplot(311)
plot(0.9*la(1:2000))
subplot(312)
plot(ya(1:2000))

% If we use a threshold of 0.5 as a decision point, what is 
% our frame accuracy?
sum( (ya>0.5) == la')/length(la)

% Try smoothing the output to enforce some continuity
yas = conv(hanning(51)/sum(hanning(51)),ya);
yas = yas(25+[1:16578]);
subplot(313)
pp = zeros(1, 99);
for t = 1:99; pp(t) = sum( (yas>(t/100)) == la')/length(la); end
plot(pp)
max(pp)
find(pp == max(pp))
% best threshold is 0.09, %corr = 82.7%
plot(yas(1:2000))
% looks much smoother, of course

%
% Now try the task with GMMs instead
%
% Separate the music and singing examples
% Music frames have 0 for the target label
ddM = dd(find(ll==0),:);
% .. and singing frames have it equal to 1
ddS = dd(find(ll==1),:);

% Use 20 mixtures for each GMM, same 26-dim data
nmix = 20;
ndim = 26;
% Initialize the models as diagonal-covariance mixtures
gmM = gmm(ndim,nmix,'diag');
gmS = gmm(ndim,nmix,'diag');
% Use k-means to get the initial cluster centers
options = foptions;
options(14) = 5;        % Just use 5 iterations of k-means in initialisation
gmM = gmminit(gmM, ddM(:,1:26), options);
gmS = gmminit(gmS, ddS(:,1:26), options);
% Now do the full EM training to optimize the GMM
options = zeros(1, 18);
options(1)  = 1;                % Prints out error values.
options(14) = 20;               % Number of iterations.
[gmM, options, errlog] = gmmem(gmM, ddM(:,1:26), options);
% .. for both music (gmM) and singing (gmS) models..
options = zeros(1, 18);
options(1)  = 1;                % Prints out error values.
options(14) = 20;               % Number of iterations.
[gmS, options, errlog] = gmmem(gmS, ddS(:,1:26), options);

% Calculate the likelihoods of our test frames under both models
lMa = gmmprob(gmM, da(:,1:26));
lSa = gmmprob(gmS, da(:,1:26));
% Decision variable is difference of log likelihoods
lra = log(lSa) - log(lMa);
% Assuming equal priors, the appropriate threshold is 0
sum( (lra > 0) == la')/length(la)
% Raw frame accuracy of these GMMs ~ 66% (guessing is 50%)
% Search for best threshold, as upper bound on performance
pp=zeros(1,99);
for t = 1:99; pp(t) = sum( (lra>(t/10 - 5)) == la')/length(la); end
plot(pp)
max(pp)
find(pp==max(pp))
% Best FA is 76% at thr=-2.5

% Try smoothing the frame-level values to reduce classification noise
lras = conv(hanning(61)/sum(hanning(61)),lra);
lras = lras(30+[1:16578]);
sum( (lras > 0) == la')/length(la)
% simple threshold on smoothed is also 76%
pp=zeros(1,99);
for t = 1:99; pp(t) = sum( (lras>(t/10 - 5)) == la')/length(la); end
plot(pp)
max(pp)
find(pp==max(pp))
% Cheating threshold on smoothed lr is -0.8 giving 85% FA upper bound

% Display the raw output
subplot(312)
tt = [0:1999]*0.016;
plot(tt, lra(1:2000))
axis([0 32 -10 10])
% Add the threshold
hold on; plot([0 32], [0 0], 'g'); hold off

% Plot the smoothed output
subplot(313)
plot(tt, lras(1:2000))
% Add the optimal threshold
hold on; plot([0 32], [-0.8 -0.8], 'g'); hold off
axis([0 32 -5 5])
% Add rectangles showing target regions
rect(T(2:2:6,1), T(2:2:6,1)+T(2:2:6,2), repmat(-4.9, 3, 1), repmat(4.9, 3,1))
subplot(312)
rect(T(2:2:6,1), T(2:2:6,1)+T(2:2:6,2), repmat(-4.9, 3, 1), repmat(4.9, 3,1))

%
% Compare to using HMM to do the smoothing
%
% Make a matrix of the log-lhoods from the GMM for 4 states:
% entry, singing, no singing, exit
llhoods=[zeros(1,16578);log(lMa');log(lSa');zeros(1,16578)];
% Hand-build a transition matrix
a = 0.0002;
tr = [0 0.5 0.5 0; 0 (1-2*a) a a; 0 a (1-2*a) a; 0 0 0 0];
% Run viterbi dynamic programming to find best path
[Q,P] = logvitlh(llhoods,tr);
plot(tt, Q(1:2000)-2.5, 'r');
sum( (Q-2) == la)/length(la)
% setting tx prob to 1/5000 pushes corr to 74.9% - not a big deal
% What happens if we make tx prob 10x smaller?
a = 0.00002;
tr = [0 0.5 0.5 0; 0 (1-2*a) a a; 0 a (1-2*a) a; 0 0 0 0];
[Q2,P2] = logvitlh(llhoods,tr);
hold on; plot(tt, Q2(1:2000)-1.5, 'r'); hold off
sum( (Q2-2) == la)/length(la)
% Up to 1/50,000 deletes some of the spots, but agreement down to 74.4%
sum( (Q2-2) == (Q-2))/length(la)
% 95% the same as the original in frames
% Compare visually to best smoothed-probs labels
lrasQmp8 = (lras > -0.8);
plot(tt, lra(1:2000), tt, lras(1:2000),'r')
axis([0 32 -10 10])
hold on; plot([0 32], [-0.8 -0.8], 'g'); hold off
subplot(413)
plot(tt, Q(1:2000), 'c', tt, lrasQmp8(1:2000)+.5,'g')
axis([0 32 0 3.6])
rect(T(2:2:6,1), T(2:2:6,1)+T(2:2:6,2), repmat(0.1, 3, 1), repmat(3.5, 3,1))

% end