# Hair and eye color example ####
## Create the data table
N <- matrix(c(326, 688, 343,  98,
               38, 116,  84,  48,
              241, 584, 909, 403,
              110, 188, 412, 681,
                3,   4,  26,  85),
            ncol = 4,
            byrow = TRUE)
colnames(N) <- c("Blue", "Light", "Medium", "Dark")
rownames(N) <- c("Fair", "Red", "Medium", "Dark", "Black")
N
x <- chisq.test(N)
# The mean square contingency coefficient (phi^2 = chi^2 / n)
x$statistic / sum(N)
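## A small extra (not from Le Roux & Rouanet): phi^2 also gives us Cramér's V,
## a common effect-size measure for contingency tables. The names phi2 and
## cramers.v below are mine, just for illustration.
phi2 <- unname(x$statistic) / sum(N)
cramers.v <- sqrt(phi2 / (min(dim(N)) - 1))
cramers.v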
## CA step by step ####
## Step 0. Create our data table, from Le Roux & Rouanet (2004:52)
N <- matrix(c(326, 688, 343,  98,
               38, 116,  84,  48,
              241, 584, 909, 403,
              110, 188, 412, 681,
                3,   4,  26,  85),
            ncol = 4,
            byrow = TRUE)
colnames(N) <- c("Blue", "Light", "Medium", "Dark")
rownames(N) <- c("Fair", "Red", "Medium", "Dark", "Black")
# Look at the matrix
N
## Optional: Save the data to a txt-file:
write.table(
  x = N,
  file = "../data/hair_eyecolor.txt"
)
# Read the data from a txt-file:
N <- as.matrix(
  read.table(
    file = "../data/hair_eyecolor.txt"
  )
)
N
## Step 0 and a half. Run CA from FactoMineR
## (this is cheating...)
FactoMineR::CA(N)
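## If you want the numbers rather than the default plot, CA() can also be run
## without graphics (res.ca is my name for the stored result):
res.ca <- FactoMineR::CA(N, graph = FALSE)
res.ca$eig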
## Step 1. Calculate the total sum, which should add up to 5387 (ibid.:52)
n <- sum(N)
n
## Step 2. Thanks to our previous calculation, it is now easy to calculate
## a table of proportions by dividing all cells by the total sum n.
P <- N / n
P
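## Quick sanity check (my addition): the proportions should sum to 1.
sum(P)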
## Step 3. Thanks to our previous calculation (again), we can now calculate
## a) the column masses,
column.masses <- colSums(P)
column.masses
## and b) the row masses,
row.masses <- rowSums(P)
row.masses
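## Another quick check (my addition): both sets of masses are marginal
## distributions, so each should sum to 1.
sum(column.masses)
sum(row.masses)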
## Step 4. Calculate expected values, given the marginal distributions
## (column and row masses).
E <- row.masses %o% column.masses # outer product of the row and column masses
E
## Basically, if there were no association between hair and eye color, these
## are the values we would expect from the marginal distribution of each
## variable, i.e., hair color and eye color respectively.
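## A cross-check (my addition): our expected proportions, scaled back to
## counts, should match the expected counts from R's chi-squared test.
all.equal(E * n, chisq.test(N)$expected)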
## Step 5. How much do the observed values differ from the expected values? Let us find out.
R <- P - E
R
## We can definitely see some differences, but they are tiny. Still, they
## point to some association between rows and columns.
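## By construction (my addition): the residuals cancel out within every row
## and column, since P and E have identical margins.
round(rowSums(R), 12)
round(colSums(R), 12)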
## Step 6. Calculate indexed residuals: a small absolute difference in a
## small cell may be relatively large (and a large difference in a large cell
## relatively small), so we divide the residuals by their expected values.
I <- R / E
I
## Now we are talking!
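## A nice identity (my addition): weighting the squared indexed residuals by
## the expected proportions recovers phi^2 = chi^2 / n from above.
sum(I^2 * E)
unname(chisq.test(N)$statistic) / n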
## Step 7. Prepare for the magic of CA: calculate the standardized residuals
## (which, among other things, mitigates the influence of outliers).
Z <- I * sqrt(E)
Z
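## The total of the squared standardized residuals is again phi^2, also known
## as the total inertia; we will meet it below as the sum of the eigenvalues
## (my addition).
sum(Z^2)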
## Step 8. The magic of CA! Everything before this step, and everything
## after, is highly intuitive and based on simple arithmetic. Now we enter
## the realm of linear algebra and perform a singular value decomposition.
## (This means we try to find uncorrelated variables from correlated ones, or
## axes if you will.)
## And no, I won't do it "by hand" just yet; here I call the R function
## `svd()` (but see the step-by-step SVD at the end of this script).
SVD <- svd(Z)
rownames(SVD$u) <- rownames(P)
rownames(SVD$v) <- colnames(P)
SVD
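## Sanity check (my addition): U, d and V should reconstruct Z, up to
## floating point noise.
all.equal(Z, SVD$u %*% diag(SVD$d) %*% t(SVD$v), check.attributes = FALSE)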
## This is, in some sense, where we get coordinates from. We will do
## some additional operations to get our final coordinates, but these are
## the numbers that we scale or standardize to get coordinates to project
## in our graphs.
## Let us first get some eigenvalues too, though.
# Eigenvalues
eigenvalues <- SVD$d^2
eigenvalues
## Do they match the ones in Le Roux & Rouanet 2004, p 53?
sum(eigenvalues)
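## Share of the total inertia per axis, in percent (my addition; the last
## axis is numerically zero):
round(100 * eigenvalues / sum(eigenvalues), 1)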
## Step 9. Time for coordinates!
# Standard coordinates
standard.coordinates.rows <- sweep(SVD$u, 1, sqrt(row.masses), "/")
standard.coordinates.columns <- sweep(SVD$v, 1, sqrt(column.masses), "/")
# Principal coordinates
principal.coordinates.rows <- sweep(standard.coordinates.rows, 2, SVD$d, "*")
principal.coordinates.columns <- sweep(standard.coordinates.columns, 2, SVD$d, "*")
## Do they match the ones in Le Roux & Rouanet 2004, p 53?
principal.coordinates.rows
plot(as.data.frame(principal.coordinates.columns[, 1:2]),
     col = "red", pch = 16, xlim = c(1.25, -1.25), ylim = c(-0.4, 0.4))
points(as.data.frame(principal.coordinates.rows[, 1:2]), col = "blue", pch = 16)
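## Label the points so the map is readable (my addition; pos = 3 puts each
## label just above its point):
text(principal.coordinates.columns[, 1:2], labels = colnames(N), col = "red", pos = 3)
text(principal.coordinates.rows[, 1:2], labels = rownames(N), col = "blue", pos = 3)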
## And now we compare to the FactoMineR CA again:
FactoMineR::CA(N)
## Voila!
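## For a numeric comparison rather than eyeballing the plots (my addition;
## remember that the sign of each axis is arbitrary, so columns may be flipped):
res.ca <- FactoMineR::CA(N, graph = FALSE)
res.ca$row$coord
principal.coordinates.rows[, 1:3]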
## Shout-out to Tim Bock (CA step by step) and Aaron Schlegel (SVD step by step).
## SVD step by step ####
## The SVD may be thought of as a technique for obtaining uncorrelated
## variables from correlated ones (finding the axes!).
## Thanks to https://rpubs.com/aaronsc32/singular-value-decomposition-r
## Right singular vectors: the eigenvectors of t(Z) %*% Z
ATA <- t(Z) %*% Z
ATA.e <- eigen(ATA)
v.mat <- ATA.e$vectors
v.mat
## eigen() picks the sign of each eigenvector arbitrarily; uncomment to flip
## columns so that they match the output of svd(Z):
# v.mat[,1:2] <- v.mat[,1:2] * -1
# v.mat
## Left singular vectors: the eigenvectors of Z %*% t(Z)
AAT <- Z %*% t(Z)
AAT
AAT.e <- eigen(AAT)
u.mat <- AAT.e$vectors
u.mat
## Z is 5 x 4, so AAT is 5 x 5; we keep as many left singular vectors as
## there are singular values, i.e. four.
u.mat <- u.mat[, 1:4]
## The singular values are the square roots of the eigenvalues of t(Z) %*% Z.
## The smallest eigenvalue is numerically zero and may come out as a tiny
## negative number, so clamp it at zero before taking the square root.
r <- sqrt(pmax(ATA.e$values, 0))
r <- diag(r) # arrange the singular values in a diagonal matrix (Sigma)
r
# Squaring the singular values recovers the eigenvalues
r^2
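## Final check (my addition): because eigen() chooses each eigenvector's sign
## independently, u.mat and v.mat are not guaranteed to be sign-consistent,
## so u.mat %*% r %*% t(v.mat) may not reproduce Z as-is. One way around this
## is to derive U from V (u_i = Z v_i / sigma_i) for the three non-null axes
## and reconstruct from those:
sigma <- diag(r)[1:3]
u.check <- Z %*% v.mat[, 1:3] %*% diag(1 / sigma)
all.equal(Z, u.check %*% diag(sigma) %*% t(v.mat[, 1:3]), check.attributes = FALSE)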