##
## Using R as a toy laboratory for DSM research
##
##
## 1) Setup (read data files)
##
# read.table() is the generic R function for reading tabular data in various text formats
# (such as CSV and TAB-delimited text). The data are loaded more efficiently if we tell
# R that all values are strings; we also need to make sure that R doesn't misinterpret a
# stray " character as the start of a (multi-line) quoted string. Setting column names
# makes it easier to access the data. Note that R can uncompress files on the fly; here
# we use the gzfile() function for GZip-compressed data. If your browser has automatically
# uncompressed the file after download, you can simply omit the gzfile() call. If you
# experience any problems with the gzfile() function (we heard such reports in R for Windows),
# try to uncompress the file outside R, and then load the resulting text file without gzfile().
tokens <- read.table(gzfile("bnc_vobj_filtered.txt.gz"), colClasses="character", quote="", col.names=c("verb", "noun"))
# The variable 'tokens' now holds co-occurrence tokens as a table with one
# (verb, noun) pair per row (in R lingo, such tables are called data.frame)
# Size of the table (rows, columns) and first 10 rows
dim(tokens)
head(tokens, 10)
# Small illustration matrix for selected nouns and verbs
selected.nouns <- c("knife","cat","dog","boat","cup","pig")
selected.verbs <- c("get","see","use","hear","eat","kill")
##
## 2) Build co-occurrence matrix
##
# The %in% operator tests whether each value on the left is contained in the
# vector on the right; note the single & for the vectorised logical "and"
# (we combine whole columns of tests here, so the scalar && would be wrong)
tokens <- subset(tokens, verb %in% selected.verbs & noun %in% selected.nouns)
# How many co-occurrence tokens are left?
dim(tokens)
head(tokens, 10)
# Construct matrix of co-occurrence counts (contingency table):
# rows = nouns, columns = verbs, cells = co-occurrence frequencies
M <- table(tokens$noun, tokens$verb)
M
# Use subscripts to extract row and column vectors
M["cat", ]
M[, "use"]
# For calculating association scores, we need the marginal frequencies
# of the nouns and verbs; for simplicity, we obtain them by summing over the
# rows and columns of the table (this is not mathematically correct, because
# the true corpus frequencies also include co-occurrences with the verbs and
# nouns that were filtered out above!)
f.nouns <- rowSums(M)
f.verbs <- colSums(M)
N <- sum(M) # sample size (sum over all cells of the table)
# Calculate expected frequencies under independence, E = f.noun * f.verb / N;
# the matrix product of a column vector with a transposed (row) vector gives
# the outer product, i.e. all pairwise products of the two marginal vectors
E <- f.nouns %*% t(f.verbs) / N
round(E, 1)
# Observed frequencies are simply the entries of M
O <- M
##
## 3) Feature scaling
##
# Because of Zipf's law, frequency distributions are highly skewed;
# DSM matrix M will be dominated by high-frequency entries
# Solution 1: transform into log-frequencies
M1 <- log10(M + 1) # discounted (+1) to avoid log(0)
round(M1, 2)
# Solution 2: association scores comparing observed against expected
# frequency, e.g. a (discounted) variant of the t-score (O - E) / sqrt(O)
M2 <- (O - E) / sqrt(O + 1) # discounted to avoid division by 0
round(M2, 2)
# "Sparse" association measures set all negative associations to 0;
# can be done with ifelse(), a vectorised if statement
# (no division by zero here: all marginals are positive, so E > 0,
# and the branch with sqrt(O) is only taken when O >= E)
M3 <- ifelse(O >= E, (O - E) / sqrt(O), 0)
round(M3, 2)
# Pick your favourite scaling method here!
M <- M2
# A simple visualisation: plot nouns in two selected dimensions
M.2d <- M[, c("get", "use")]
round(M.2d, 2)
# Two-column matrix automatically interpreted as x- and y-coordinates
plot(M.2d, pch=20, col="red", main="DSM visualisation")
# Add labels: the text strings are the rownames of M
text(M.2d, labels=rownames(M.2d), pos=3)
#-- internal use: plot for lecture slides (larger fonts, adjusted margins and
#-- fixed axis ranges); dev.copy2pdf() saves the current plot to a PDF file
par(cex=1.2, mar=c(4,4,2,1)+.1)
plot(M.2d, pch=19, col="red", main="DSM visualisation", xlim=c(-5,5), ylim=c(-10,10))
text(M.2d, labels=rownames(M.2d), pos=3)
dev.copy2pdf(file="dsm_lab_2d_plot.pdf", onefile=FALSE, bg="white")
##
## 4) Norms and distances
##
# The intuitive length of a vector is its Euclidean norm,
# ||x|| = sqrt(x_1^2 + ... + x_n^2)
# (NB: the R code reads almost like the mathematical definition)
euclid.norm <- function (x) {
  sqrt(sum(x^2))
}
# Euclidean distance between two vectors = norm of their difference,
# written out directly as sqrt(sum((x - y)^2))
euclid.dist <- function (x, y) {
  delta <- x - y
  sqrt(sum(delta * delta))
}
# Note: We will discuss alternative norms and distances on Thursday!
# Compute lengths (norms) of all row vectors by applying euclid.norm()
# to every row of M
row.norms <- apply(M, 1, euclid.norm) # 1 = rows, 2 = columns
round(row.norms, 2)
# Normalisation: divide each row by its norm, so that all row vectors have
# length 1; this is a rescaling of the row "dimensions" and can be done by
# left-multiplication with a diagonal matrix of the inverse norms
scaling.matrix <- diag(1 / row.norms)
round(scaling.matrix, 3)
# NB: diag() and %*% do not carry over the dimnames of M, so the row
# labels are lost here (they are restored in the next section)
M.norm <- scaling.matrix %*% M
round(M.norm, 2)
##
## 5) Nearest neighbours
##
# Matrix multiplication has lost the row labels (copy from M)
rownames(M.norm) <- rownames(M)
# To calculate distances of all terms e.g. from "dog", apply euclid.dist()
# function to rows, supplying the "dog" vector as fixed second argument
# (extra arguments of apply() are passed on to the applied function)
v.dog <- M.norm["dog",]
dist.dog <- apply(M.norm, 1, euclid.dist, y=v.dog)
# Now we can sort the vector of distances to find nearest neighbours
# (the first entry is "dog" itself, at distance 0)
sort(dist.dog)
# R has a built-in function to compute a full distance matrix;
# dist() operates on rows and returns a compact triangular "dist" object
distances <- dist(M.norm, method="euclidean")
round(distances, 2)
# If you want to search nearest neighbours, convert triangular distance
# matrix to full symmetric matrix and extract distance vectors from rows
dist.matrix <- as.matrix(distances)
sort(dist.matrix["dog",])
##
## 6) Clustering & semantic maps
##
# Distance matrix is also the basis for a hierarchical cluster analysis;
# hclust() builds a cluster tree, which plot() draws as a dendrogram
plot(hclust(distances))
# Visualisation as semantic map by projection into 2-dimensional space;
# uses non-linear multidimensional scaling (MDS)
library(MASS) # isoMDS() is provided by the MASS package
M.mds <- isoMDS(distances)$points # $points = matrix of 2-d coordinates, one row per term
# Plot works as for the two selected dimensions above
plot(M.mds, pch=20, col="red", main="Semantic map", xlab="Dim 1", ylab="Dim 2")
text(M.mds, labels=rownames(M.mds), pos=3)
#-- internal use: plots for lecture slides (larger fonts, fixed axis ranges;
#-- each plot is saved to a PDF file with dev.copy2pdf())
par(cex=1.4, mar=c(4,4,2,1)+.1)
plot(M.mds, pch=19, col="red", main="Semantic map", xlim=c(-1,1), ylim=c(-1,1), xlab="Dim 1", ylab="Dim 2")
text(M.mds, labels=rownames(M.mds), pos=3)
dev.copy2pdf(file="dsm_lab_semantic_map.pdf", onefile=FALSE, bg="white")
plot(hclust(distances), xlab="Euclidean distance")
dev.copy2pdf(file="dsm_lab_cluster.pdf", onefile=FALSE, bg="white")