# Hair and eye color example ####

## Create the data table
N <- matrix(c(326, 688, 343,  98,
               38, 116,  84,  48,
              241, 584, 909, 403,
              110, 188, 412, 681,
                3,   4,  26,  85),
            ncol = 4, byrow = TRUE)
colnames(N) <- c("Blue", "Light", "Medium", "Dark")
rownames(N) <- c("Fair", "Red", "Medium", "Dark", "Black")
N

x <- chisq.test(N)

# The mean square contingency coefficient
x$statistic / sum(N)

## CA step by step ####

## Step 0. Create our data table, from Le Roux & Rouanet (2004:52)
N <- matrix(c(326, 688, 343,  98,
               38, 116,  84,  48,
              241, 584, 909, 403,
              110, 188, 412, 681,
                3,   4,  26,  85),
            ncol = 4, byrow = TRUE)
colnames(N) <- c("Blue", "Light", "Medium", "Dark")
rownames(N) <- c("Fair", "Red", "Medium", "Dark", "Black")

# Look at the matrix
N

## Optional: save the data to a txt file:
write.table(x = N, file = "../data/hair_eyecolor.txt")

# Read the data from a txt file:
N <- as.matrix(read.table(file = "../data/hair_eyecolor.txt"))
N

## Step 0 and a half. Run CA from FactoMineR
## (this is cheating...)
FactoMineR::CA(N)

## Step 1. Calculate the total sum, which should add up to 5387 (ibid.:52)
n <- sum(N)
n

## Step 2. Thanks to our previous calculation, it is now easy to calculate
## a table of proportions by dividing all cells by the total sum n.
P <- N / n
P

## Step 3. From the table of proportions we can now calculate a) the column masses
column.masses <- colSums(P)
column.masses

## and b) the row masses
row.masses <- rowSums(P)
row.masses

## Step 4. Calculate the expected values, given the marginal distributions
## (column and row masses).
E <- row.masses %o% column.masses  # outer product of row and column masses
E
## If there were no association between hair and eye color, these are the
## values we would expect from the distribution of each variable on its own,
## i.e., hair color and eye color respectively.

## Step 5. How much do the observed values differ from the expected values?
## Let us find out.
R <- P - E
R
## We can definitely see some differences, but they are tiny. Still, they
## point to some association between rows and columns.

## Step 6. Calculate indexed residuals, because small differences in small
## cells might be large (and large differences in large cells may be small).
## We divide the residuals by their expected values.
I <- R / E
I
## Now we are talking!

## Step 7. Prepare for the magic of CA: calculate the standardized residuals
## (a way of mitigating outliers, for example).
Z <- I * sqrt(E)
Z

## Step 8. The magic of CA! Everything before this step, and everything after
## it, is highly intuitive and based on simple arithmetic. Now we enter the
## realm of linear algebra and perform a singular value decomposition.
## (This means we try to find uncorrelated variables from correlated ones,
## or axes if you will.)
## And no, I won't do it "by hand" this time; I will call the R function svd().
SVD <- svd(Z)
rownames(SVD$u) <- rownames(P)
rownames(SVD$v) <- colnames(P)
SVD
## This is, in some sense, where the coordinates come from. We will do some
## additional operations to get our final coordinates, but these are the
## numbers that we scale or standardize to get coordinates to project in
## our graphs.

## Let us first get some eigenvalues too, though.

# Eigenvalues
eigenvalues <- SVD$d^2
eigenvalues

## Do they match the ones in Le Roux & Rouanet (2004:53)?
sum(eigenvalues)
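## Sanity checks (a minimal sketch, not part of Le Roux & Rouanet's exposition):
## the SVD should reconstruct Z exactly, and the total inertia, i.e. the sum of
## the eigenvalues, should equal the mean square contingency coefficient
## (chi-square / n) that we computed at the top of this script.
all.equal(Z, SVD$u %*% diag(SVD$d) %*% t(SVD$v), check.attributes = FALSE)
all.equal(sum(eigenvalues), as.numeric(chisq.test(N)$statistic / n))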
## Step 9. Time for coordinates!

# Standard coordinates
standard.coordinates.rows <- sweep(SVD$u, 1, sqrt(row.masses), "/")
standard.coordinates.columns <- sweep(SVD$v, 1, sqrt(column.masses), "/")

# Principal coordinates
principal.coordinates.rows <- sweep(standard.coordinates.rows, 2, SVD$d, "*")
principal.coordinates.columns <- sweep(standard.coordinates.columns, 2, SVD$d, "*")

## Do they match the ones in Le Roux & Rouanet (2004:53)?
principal.coordinates.rows

## Plot the first two principal coordinates (note the reversed x axis).
plot(as.data.frame(principal.coordinates.columns[, 1:2]),
     col = "red", pch = 16,
     xlim = c(1.25, -1.25), ylim = c(-0.4, 0.4))
points(as.data.frame(principal.coordinates.rows[, 1:2]), col = "blue", pch = 16)

## And now we compare to the FactoMineR CA again:
FactoMineR::CA(N)
## Voila!

## Shout-out to Tim Bock (CA step by step) and Aaron Schlegel (SVD step by step).

## SVD step by step ####

## The SVD may be thought of as a technique for obtaining uncorrelated
## variables from correlated ones (finding the axes!).
# Thanks https://rpubs.com/aaronsc32/singular-value-decomposition-r

# The right singular vectors are the eigenvectors of t(Z) %*% Z
ATA <- t(Z) %*% Z
ATA.e <- eigen(ATA)
v.mat <- ATA.e$vectors
v.mat
# v.mat[, 1:2] <- v.mat[, 1:2] * -1
# v.mat

# The left singular vectors are the eigenvectors of Z %*% t(Z)
AAT <- Z %*% t(Z)
AAT
AAT.e <- eigen(AAT)
u.mat <- AAT.e$vectors
u.mat
u.mat <- u.mat[, 1:3]  # keep only the first three left singular vectors

# The singular values are the square roots of the eigenvalues,
# arranged on the diagonal
r <- sqrt(ATA.e$values)
r <- r * diag(length(r))[, 1:4]
r

# And square r to recover the eigenvalues
r^2
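## Sanity check (a minimal sketch, assuming the objects computed above): the
## singular values recovered from the eigendecomposition should match the ones
## returned by svd(Z). Z is a matrix of residuals, so it has at most
## min(nrow, ncol) - 1 = 3 non-zero singular values; the fourth eigenvalue of
## t(Z) %*% Z is zero in theory and may come out slightly negative in floating
## point, hence the abs() below. Also, eigen() only determines eigenvectors up
## to sign, so the columns of u.mat and v.mat may differ from SVD$u and SVD$v
## by a factor of -1 (which is what the commented-out sign flip above is for).
round(sqrt(abs(ATA.e$values)), 6)
round(SVD$d, 6)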