PCA biplot - Matlab

Question

Based on to answers Unable to create an array from a table, Coloring The Dots in biPlot Chart I wrote the code below. How can I change the legends to show the clusters = Tb.class (iris species) and how can I convex hull each group?

Code:

clc;
clear;
close all;

Tb = webread('https://datahub.io/machine-learning/iris/r/iris.csv');
clusters = Tb.class;

X = [Tb.sepallength Tb.sepalwidth   Tb.petallength  Tb.petalwidth ];

Z = zscore(X); % Standardized data
[coefs,score] = pca(Z);

vbls = {'sepallength','sepalwidth','petallength','petalwidth'}; 
h=biplot(coefs(:,1:2),'Scores',score(:,1:2),'VarLabels',vbls);

hID = get(h, 'tag'); 
% Isolate handles to scatter points
hPt = h(strcmp(hID,'obsmarker')); 
% Identify cluster groups
grp = findgroups(clusters);    %r2015b or later - leave comment if you need an alternative
grp(isnan(grp)) = max(grp(~isnan(grp)))+1; 
grpID = 1:max(grp); 
% assign colors and legend display name
clrMap = lines(length(unique(grp)));   % using 'lines' colormap
for i = 1:max(grp)
    set(hPt(grp==i), 'Color', clrMap(i,:), 'DisplayName', sprintf('Cluster %d', grpID(i)))
end
% add legend to identify cluster
[~, unqIdx] = unique(grp);
legend(hPt(unqIdx))

mastropi · Answer 1 · 2020-03-23T15:10:10.523

1) For the legend you can simply use the cluster names as follows:

unqClusters = unique(clusters);
for i = 1:max(grp)
    % Original legend showing "Cluster n"
    %set(hPt(grp==i), 'Color', clrMap(i,:), 'DisplayName', sprintf('Cluster %d', grpID(i)))
    % New legend showing the flower class name
    set(hPt(grp==i), 'Color', clrMap(i,:), 'DisplayName', sprintf('%s', unqClusters{i}))
end

2) For the convex hull part, you could use the contours of the 2D normal distribution.

Here is a code that works on the first two raw PC scores, that you would need to adapt to the standardization performed by the biplot() function.

The code essentially goes over each cluster, computes the mean and covariance of the first two PC values, and uses these values as the parameters of the 2D normal distribution whose PDF is computed on a meshgrid. Finally a level value of the 2D PDF is selected for plotting, which results in an ellipse covering most of the points in the cluster.

%% Convex hulls around each cluster (ellipses based on 2D normal distribution)
% Execution parameters:
% Number of points used for the meshgrid to generate the Normal PDF
resolution = 100;
% Factor of PC range by which to extend the min and max PC values
% on which the normal PDF is computed
% (adjust this value so that the ellipse around the points is drawn completely)
factor = 2;

% Variables to store the location and scale parameters of each cluster
mu = cell(1,3);
Sigma = cell(1,3);

% Do the calculations for each cluster and plot points and ellipses on a single plot
figure
hold on
for g = grpID
    % Score (PC) values to analyze
    scoregrp = score(grp==g,1:2);
    mu{g} = mean(scoregrp);
    Sigma{g} = cov(scoregrp);

    % PC1 and PC2 values on which to compute the Normal PDF
    pcvalues = cell(1,2);
    for c = 1:length(pcvalues)
        pcrange = range(scoregrp(:,c));
        pcmin = min(scoregrp(:,c)) - factor*pcrange;
        pcmax = max(scoregrp(:,c)) + factor*pcrange;
        pcstep = pcrange / resolution;
        pcvalues{c} = pcmin:pcstep:pcmax;
    end

    % Meshgrid generated from the above PC values
    [XPoints, YPoints] = meshgrid(pcvalues{1}, pcvalues{2});
    XYPoints = [XPoints(:) YPoints(:)];
    npoints = size(XYPoints, 1);

    % Variables used repeatedly in the computation of the normal PDF
    detSigma = det(Sigma{g});
    invSigma = inv(Sigma{g});

    % Normal PDF on the meshgrid
    % Note: the PDF is computed via a loop instead of matrix operation
    % due to memory constraints
    pdfnormal = nan(1, npoints);
    den = sqrt(2*pi*detSigma);
    for i = 1:npoints
        point = XYPoints(i,:) - mu{g};
        pdfnormal(i) = exp( -0.5 * point * invSigma * point' ) / den;
    end

    % Reshape the PDF to 2D matrix so that we can plot the contours
    pdfnormal2D = reshape(pdfnormal, length(pcvalues{1}), length(pcvalues{2}));

    % Value to plot as ellipse where the normal falls at about the value
    % of the standard normal at two standard deviations
    pdfvalue2Std = pdf('norm', 2);

    % Plot!
    plot(scoregrp(:,1), scoregrp(:,2), '.', 'Color', clrMap(g,:))
    contour(XPoints, YPoints, pdfnormal2D, pdfvalue2Std, 'LineWidth', 2, ...
            'Color', clrMap(g,:))
end
xlabel('PC1')
ylabel('PC2')

which generates the following graph:

PCA biplot - Matlab

1 Answers1