library(quanteda)
library(quanteda.corpora)

library(gridExtra)

library(factoextra)
library(ggplot2)
library(ggdendro)

set.seed(042)

1 Clustering texts

1.1 Text similarity refresher

(This section is from Pablo Barbera’s LSE course materials. For links see the workshop repo description)

docs <- c("this is document one", "this is document two")
(doc_dfm <- dfm(docs))
#> Document-feature matrix of: 2 documents, 5 features (20.0% sparse).
#>        features
#> docs    this is document one two
#>   text1    1  1        1   1   0
#>   text2    1  1        1   0   1

Calculate distance and similarity with the use of the Eucledian distance and cosine similarity

Euclidean distrance:

textstat_dist(doc_dfm, method = "euclidean")
#> textstat_dist object; method = "euclidean"
#>       text1 text2
#> text1     0  1.41
#> text2  1.41     0

Remember: \(d_2(X_i, X_j)=(\sum(X_{i,k}-X_{j,k})^2)^{1/2}\)

The distances we want to compute

(d1 <- as.numeric(doc_dfm[1,]))
#> [1] 1 1 1 1 0

(d2 <- as.numeric(doc_dfm[2,]))
#> [1] 1 1 1 0 1

Doing it by hand

sqrt(sum((d1 - d2)^2))
#> [1] 1.414214

Important: Euclidean distance measures the distance of the documents. If we want to express similarity with this we need to formulate it as (1-distance) = similarity.

Cosine similarity

textstat_simil(doc_dfm, method="cosine")
#> textstat_simil object; method = "cosine"
#>       text1 text2
#> text1  1.00  0.75
#> text2  0.75  1.00

Cosine similarity is computed as: \(\frac{\sum A*B}{\sqrt{\sum A^2} * \sqrt{\sum B^2}}\)

sum(d1 * d2) / (sqrt(sum(d1^2)) *  sqrt(sum(d2^2)))
#> [1] 0.75

1.2 Clustering texts

We use the US presidents’ State of the Union speeches to try and cluster them with K-means and hierarchical clustering.

sotu <- corpus_subset(data_corpus_sotu, Date >= "1990-01-01")

summary(sotu, 5)
#> Corpus consisting of 31 documents, showing 5 documents:
#> 
#>          Text Types Tokens Sentences  FirstName President       Date delivery
#>     Bush-1990  1206   4312       198     George      Bush 1990-01-31   spoken
#>     Bush-1991  1297   4468       226     George      Bush 1991-01-29   spoken
#>     Bush-1992  1493   5750       320     George      Bush 1992-01-28   spoken
#>  Clinton-1993  1561   7681       292 William J.   Clinton 1993-02-17   spoken
#>  Clinton-1994  1762   8267       399 William J.   Clinton 1994-01-25   spoken
#>   type      party
#>   SOTU Republican
#>   SOTU Republican
#>   SOTU Republican
#>  other Democratic
#>   SOTU Democratic

ndoc(sotu)
#> [1] 31

Let’s do the usual steps and create a dfm.

sotu_dfm <- dfm(sotu, remove_punct = TRUE, remove_numbers = TRUE, 
                stem = TRUE, remove = stopwords("english")) %>% 
    dfm_trim(min_docfreq = 3) %>% 
    dfm_weight("prop")

sotu_dfm
#> Document-feature matrix of: 31 documents, 2,878 features (64.5% sparse) and 6 docvars.
#>               features
#> docs                     mr      presid      speaker       member         unit
#>   Bush-1990    0.0010989011 0.003296703 0.0005494505 0.0021978022 0.0021978022
#>   Bush-1991    0.0010427529 0.001042753 0.0005213764 0.0005213764 0.0062565172
#>   Bush-1992    0.0008470987 0.003388395 0.0008470987 0.0016941974 0.0008470987
#>   Clinton-1993 0.0006086427 0.002738892 0.0003043214 0.0012172855 0.0015216068
#>   Clinton-1994 0.0008351893 0.004175947 0.0002783964 0.0011135857 0.0011135857
#>   Clinton-1995 0.0011958862 0.003109304 0.0009567089 0.0021525951 0.0011958862
#>               features
#> docs                 state    congress       return       former        senat
#>   Bush-1990    0.006593407 0.002747253 0.0005494505 0.0010989011 0.0005494505
#>   Bush-1991    0.006777894 0.002085506 0.0015641293 0            0           
#>   Bush-1992    0.003811944 0.005929691 0.0004235493 0.0008470987 0           
#>   Clinton-1993 0.003043214 0.003347535 0            0.0006086427 0.0006086427
#>   Clinton-1994 0.004732739 0.006681514 0.0002783964 0.0011135857 0           
#>   Clinton-1995 0.003348481 0.004783545 0.0004783545 0.0002391772 0.0004783545
#> [ reached max_ndoc ... 25 more documents, reached max_nfeat ... 2,868 more features ]

The K-means clustering happens with the kmeans function which is built in the base R toolkit. We try first with centers = 2 to see if we can get a republican democrat divide.

sotu_k <- kmeans(sotu_dfm, centers = 2)

table(sotu_k$cluster)
#> 
#>  1  2 
#> 13 18

Seems OK.

head(docvars(sotu)$President[sotu_k$cluster == 1])
#> [1] "Bush" "Bush" "Bush" "Bush" "Bush" "Bush"

head(docvars(sotu)$President[sotu_k$cluster == 2])
#> [1] "Bush"    "Clinton" "Clinton" "Clinton" "Clinton" "Clinton"

We can investigate the feature level as well.

head(textstat_keyness(sotu_dfm, target = sotu_k$cluster == 1), n = 15)
#>      feature       chi2         p   n_target  n_reference
#> 1       iraq 0.03417017 0.8533453 0.03989207 0.0077798240
#> 2    freedom 0.03361911 0.8545194 0.05164846 0.0156170578
#> 3  terrorist 0.03159007 0.8589303 0.04446892 0.0119436982
#> 4      iraqi 0.03004887 0.8623794 0.02528050 0.0015851224
#> 5     terror 0.02556479 0.8729678 0.02937844 0.0055403555
#> 6      regim 0.02309526 0.8792096 0.01933686 0.0011728266
#> 7      great 0.02064668 0.8857456 0.05494865 0.0275759568
#> 8      enemi 0.01834473 0.8922619 0.02223781 0.0046625407
#> 9       unit 0.01659551 0.8974973 0.05210149 0.0290219730
#> 10    border 0.01402882 0.9057164 0.02202951 0.0068348279
#> 11      free 0.01382954 0.9063853 0.03150149 0.0140918030
#> 12    saddam 0.01369552 0.9068380 0.01280039 0.0013477497
#> 13    danger 0.01317206 0.9086277 0.02110657 0.0067105024
#> 14  homeland 0.01311258 0.9088334 0.01015418 0.0002909514
#> 15     qaida 0.01256857 0.9107365 0.01271956 0.0017480131

head(textstat_keyness(sotu_dfm, target = sotu_k$cluster == 2), n = 15)
#>      feature        chi2         p   n_target n_reference
#> 1       work 0.020162196 0.8870850 0.17812974 0.068911160
#> 2     colleg 0.017403544 0.8950456 0.03961805 0.004959337
#> 3       busi 0.017033605 0.8961607 0.06828977 0.016977315
#> 4        job 0.014934051 0.9027367 0.11984359 0.044556480
#> 5      thing 0.014137675 0.9053530 0.04489232 0.008987512
#> 6        say 0.013929780 0.9060483 0.05295849 0.012594455
#> 7       make 0.013756928 0.9066303 0.13572055 0.054662105
#> 8        get 0.013643649 0.9070138 0.07697173 0.024085925
#> 9        pay 0.012852379 0.9097385 0.04768393 0.011101193
#> 10    welfar 0.012650772 0.9104463 0.02611588 0.002655568
#> 11       cut 0.011887127 0.9131802 0.05860493 0.016897110
#> 12   deficit 0.011854509 0.9132989 0.03529355 0.006570513
#> 13      year 0.011125458 0.9159971 0.21623118 0.105581047
#> 14      educ 0.010143043 0.9197785 0.04884426 0.013864983
#> 15 communiti 0.009797495 0.9211523 0.05987395 0.019564031

1.2.1 How many K?

How many K do we need? We can choose between three methods, the elbow, silhouette and gap statistics. For more on them, see this practical guide: https://uc-r.github.io/kmeans_clustering.

These methods have been implemented for the factoextra package so they are easy to visualize.


fviz_nbclust(as.matrix(sotu_dfm), kmeans, method = "wss")

Visual representation of our k-means

fviz_cluster(sotu_k, data = sotu_dfm)

Let’s compare the evolution with different K-s.

k2 <- kmeans(sotu_dfm, centers = 2)
k3 <- kmeans(sotu_dfm, centers = 3)
k4 <- kmeans(sotu_dfm, centers = 4)
k5 <- kmeans(sotu_dfm, centers = 5)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = sotu_dfm) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point",  data = sotu_dfm) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point",  data = sotu_dfm) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point",  data = sotu_dfm) + ggtitle("k = 5")


gridExtra::grid.arrange(p1, p2, p3, p4, nrow = 2)

1.3 Hierarchical clustering

The process of hierarchical clustering rests on computing some sort of distance measure and then start our bottom-up aggregation process. IMPORTANT, use some sort of normalization (we did with the dfm_weight in the beggining) for the Euclidean distance computation!

sotu_dist_mat <- as.dist(textstat_dist(sotu_dfm, method = "euclidean"))

The clustering algorithm is provided by the base R function hclust

sotu_hclust <- hclust(sotu_dist_mat, method = "complete")

A visual representation with base R

plot(sotu_hclust)

A nicer plot can be made with the ggdendro package

ggdendrogram(sotu_hclust, rotate = FALSE)

Clustering texts

K-means and Hierarchical clustering

2020 November

1 Clustering texts

1.1 Text similarity refresher

1.2 Clustering texts

1.2.1 How many K?

1.3 Hierarchical clustering