% Demos for pattern classification lecture
% 2003-02-10 Dan Ellis dpwe@ee.columbia.edu

% Neural net training
% Load data
load fmtO.txt
load fmtU.txt
load fmtA.txt
% Build 2-dimensional data set [F1, F2]
dat = [fmtO(:,[1 2]);fmtU(:,[1 2]);fmtA(:,[1 2])];
size(dat)

ans =

   150     2

% 50 examples of each
% Build target outputs.  One unit for O, one for U, one for A
oo = ones(50,1); zz = zeros(50,1);
tgt = [oo,zz,zz;zz,oo,zz;zz,zz,oo];
size(tgt)

ans =

   150     2

% Calculate normalization, so input units don't saturate
nrm = [mean(dat);std(dat)];
% Train net, 5 HUs, 50 epochs
[wh,wo,es] = nntrain(dat,nrm,tgt,0.1,50,5);
Iteration=1 MSError =0.88821
Iteration=2 MSError =0.64701
...
Iteration=49 MSError =0.22906
Iteration=50 MSError =0.22845
% Continue training with reduced learning rate
[wh,wo,es] = nntrain(dat,nrm,tgt,0.05,10,wh,wo);
Iteration=1 MSError =0.22299
...
Iteration=10 MSError =0.21944
[wh,wo,es] = nntrain(dat,nrm,tgt,0.025,10,wh,wo);
Iteration=1 MSError =0.21249
...
Iteration=10 MSError =0.21339
% Look at the performance on the training data.  Should flip after 50
nno = nnfwd(dat,nrm,wh,wo);
plot(nno)
% Not bad.  A few patterns are hard, indicated by spikes in response

% Show decision boundary (output X > other outpus) on data scatter
plot(fmtU(:,1),fmtU(:,2),'.r',fmtO(:,1),fmtO(:,2),'.b',fmtA(:,1),fmtA(:,2),'.g');
% Sample entire 2D surface of NN outputs
[nnO,xx,yy] = nngridsamp(wh,wo,nrm,[200 1100 600 1600],60,1);
[nnU,xx,yy] = nngridsamp(wh,wo,nrm,[200 1100 600 1600],60,2);
[nnA,xx,yy] = nngridsamp(wh,wo,nrm,[200 1100 600 1600],60,3);
% Use contour to plot where their difference crosses zero
hold on
contour(xx,yy,nnO-max(nnU,nnA),[0 0])
contour(xx,yy,nnA-max(nnU,nnO),[0 0])
% Pretty good decision boundary.  You can see the 'wrong' points
% Add the actual surfaces defined by the outputs:
surf(xx,yy,nnO)
surf(xx,yy,nnU)
surf(xx,yy,nnA)
hold off


% Single gaussian example
% Since a Gaussian is defined by its mean and covariance, 
% fitting a single Gaussian to a dataset is simply a matter of 
% calculating the sample mean and covariance, and using the
% Gaussian they define
mu = mean(fmtU(:,[1 2]));
vu = cov(fmtU(:,[1 2]));
mo = mean(fmtO(:,[1 2]));
vo = cov(fmtO(:,[1 2]));
ma = mean(fmtA(:,[1 2]));
va = cov(fmtA(:,[1 2]));
% To plot the 1 sigma 'radius' of the Gaussian, do the inverse 
% mapping from Euclidean space...
circ = [cos([0:20]'/10*pi),sin([0:20]'/10*pi)];
[u,s,v] = svd(inv(vo));
circo = v'*inv(sqrt(s))*circ';
% Plot on top of data scatter
plot(fmtU(:,1),fmtU(:,2),'.r',fmtO(:,1),fmtO(:,2),'.b',fmtA(:,1),fmtA(:,2),'.g');
hold on
plot(mo(1)+circo(1,:),mo(2)+circo(2,:),'b')
hold off
% Can also do the reverse i.e. map data points to space where Gauss is 
% a unit circle
im = sqrt(s)*v;
mU = (im*fmtU(:,[1 2])')';
mO = (im*fmtO(:,[1 2])')';
mA = (im*fmtA(:,[1 2])')';
mmo = (im*mo')';
plot(mU(:,1),mU(:,2),'.r',mO(:,1),mO(:,2),'.b',mA(:,1),mA(:,2),'.g');
hold on
plot(mmo(1)+circ(:,1),mmo(2)+circ(:,2),'b')
% However, back to normal space
hold off
plot(fmtU(:,1),fmtU(:,2),'.r',fmtO(:,1),fmtO(:,2),'.b',fmtA(:,1),fmtA(:,2),'.g');
% Easier to use gmmplot (with a single gauss) to get circle
gmmplot(mu,vu,[],[1 2],1,'r')
gmmplot(ma,va,[],[1 2],1,'g')
gmmplot(mo,vo,[],[1 2],1,'b')
% Decision boundary is when Gaussians (scaled by priors) are equal
% Use gmmgridsamp to evaluate over a grid
[ggA,xx,yy] = gmmgridsamp(ma,va,1,[200 1100 600 1600],60);
[ggO,xx,yy] = gmmgridsamp(mo,vo,1,[200 1100 600 1600],60);
[ggU,xx,yy] = gmmgridsamp(mu,vu,1,[200 1100 600 1600],60);
hold on
contour(xx,yy,ggO-max(ggA,ggU),[0 0])
contour(xx,yy,ggA-max(ggO,ggU),[0 0])
contour(xx,yy,ggU-max(ggO,ggA),[0 0])
% Reasonable boundary, not particularly optimal for training data
% Add surfaces
surf(xx,yy,ggU)
surf(xx,yy,ggO)
surf(xx,yy,ggA)

% Note: making covariances uniform and equal would turn this into linear decision boundaries
hold off
vv = [2000,0;0,2000];
plot(fmtU(:,1),fmtU(:,2),'.r',fmtO(:,1),fmtO(:,2),'.b',fmtA(:,1),fmtA(:,2),'.g');
gmmplot(mu,vv,[],[1 2],1,'r')
gmmplot(mo,vv,[],[1 2],1,'b')
gmmplot(ma,vv,[],[1 2],1,'g')
[ggU,xx,yy] = gmmgridsamp(mu,vv,1,[200 1100 600 1600],60);
[ggO,xx,yy] = gmmgridsamp(mo,vv,1,[200 1100 600 1600],60);
[ggA,xx,yy] = gmmgridsamp(ma,vv,1,[200 1100 600 1600],60);
hold on
contour(xx,yy,ggO-max(ggA,ggU),[0 0])
% Some numerical problems creeping in, but you get the picture


% Gaussian mixture example
% Can train one GMM on whole data set - a kind of unsupervised clustering
hold off;
[gmm,gmv,gmc]=gmmest(dat,3,[],[],50,1);
Iteration=1 Log data likelihood = -1947.0721 delta=8051.9279
Iteration=2 Log data likelihood = -1889.733 delta=57.3391
...
Iteration=49 Log data likelihood = -1850.7996 delta=0.19012
Iteration=50 Log data likelihood = -1850.5923 delta=0.20731
% Not terrible for our task!
plot(fmtU(:,1),fmtU(:,2),'.r',fmtO(:,1),fmtO(:,2),'.b',fmtA(:,1),fmtA(:,2),'.g');
gmmplot(gmm,gmv);
% But boundary between U and O is pretty chancy - probably diff if we do it again
[gmm,gmv,gmc]=gmmest(dat,3,[],[],50);
Iteration=1 Log data likelihood = -1935.1267 delta=8063.8733
Iteration=2 Log data likelihood = -1907.581 delta=27.5458
...
Iteration=49 Log data likelihood = -1862.8608 delta=0.0036941
Iteration=50 Log data likelihood = -1862.8571 delta=0.0036349
hold off
plot(fmtU(:,1),fmtU(:,2),'.r',fmtO(:,1),fmtO(:,2),'.b',fmtA(:,1),fmtA(:,2),'.g');
gmmplot(gmm,gmv);
% Yuk!

% For our estimate of the bayesian classifier, train GMMs for each class
[gmmU,gmvU,gmcU]=gmmest(fmtU(:,[1 2]),3,[],[],50);
Iteration=1 Log data likelihood = -569.3872 delta=9429.6128
Iteration=2 Log data likelihood = -552.1006 delta=17.2866
...
Iteration=49 Log data likelihood = -547.4049 delta=0.0011051
Iteration=50 Log data likelihood = -547.4039 delta=0.0010106
[gmmO,gmvO,gmcO]=gmmest(fmtO(:,[1 2]),3,[],[],50);
Iteration=1 Log data likelihood = -570.195 delta=9428.805
Iteration=2 Log data likelihood = -548.5755 delta=21.6195
...
Iteration=49 Log data likelihood = -525.7043 delta=2.9913e-09
Iteration=50 Log data likelihood = -525.7043 delta=1.8032e-09
[gmmA,gmvA,gmcA]=gmmest(fmtA(:,[1 2]),3,[],[],50);
Iteration=1 Log data likelihood = -626.5629 delta=9372.4371
Iteration=2 Log data likelihood = -599.119 delta=27.4439
...
Iteration=49 Log data likelihood = -588.5126 delta=0.34033
Iteration=50 Log data likelihood = -588.341 delta=0.17157
plot(fmtU(:,1),fmtU(:,2),'.r',fmtO(:,1),fmtO(:,2),'.b',fmtA(:,1),fmtA(:,2),'.g');
gmmplot(gmmU,gmvU,[],[1 2],1,'r');
gmmplot(gmmO,gmvO,[],[1 2],1,'b');
gmmplot(gmmA,gmvA,[],[1 2],1,'g');
% Sample surfaces
[gmsU,xx,yy] = gmmgridsamp(gmmU,gmvU,gmcU,[200 1100 600 1600],60);
[gmsO,xx,yy] = gmmgridsamp(gmmO,gmvO,gmcO,[200 1100 600 1600],60);
[gmsA,xx,yy] = gmmgridsamp(gmmA,gmvA,gmcA,[200 1100 600 1600],60);
% Decision boundaries
hold on
contour(xx,yy,gmsO-max(gmsA,gmsU),[0 0])
contour(xx,yy,gmsA-max(gmsO,gmsU),[0 0])
% Freaky!
% Add surfaces
surf(xx,yy,gmsO)
surf(xx,yy,gmsA)
surf(xx,yy,gmsU)
% Note: integral under surfaces is 1 (modulo grid density)
[sum(sum(gmsU)),sum(sum(gmsO)),sum(sum(gmsA))]

ans =

    0.0039    0.0039    0.0038

% dx is 900/60 = 15 and dy = 1000/60 = 16.67 so integral is 1/(15*16.67)
1/(15*16.67)

ans =

    0.0040

% a little under