New packages to install: devtools, quanteda.dictionaries, and quanteda.corpora.

# One-time setup: install the helper packages (run once, not every session).
install.packages("devtools")
devtools::install_github("kbenoit/quanteda.dictionaries") # press 3, so no packages are updated during install
devtools::install_github("quanteda/quanteda.corpora") # press 3, so no packages are updated during install
install.packages("tibble")
install.packages("broom")

And load the packages that we’ll use.

# Load the packages used throughout the session.
library(dplyr)
library(lubridate)
library(stringr)
library(ggplot2)
library(quanteda)
library(quanteda.corpora)
library(quanteda.dictionaries)
library(broom)


# Fix the RNG state so the sample() calls below are reproducible.
# (Originally written set.seed(042): R parses 042 as the plain number 42 --
# there are no octal literals -- so the leading zero was only visual noise.)
set.seed(42)

1 dictionaries in quanteda

We can create dictionaries in quanteda with the dictionary() function. They are essentially named lists of words combined into a single object. The function allows to load a file from your computer or create one in R.

We’ll use the Laver-Garry dictionary which is in the /dictionary folder and load it into R. If it is in a specific dictionary format, you can specify it in the format = argument.

# Read the Laver-Garry dictionary from the workshop's /dictionary folder.
# dictionary() guesses the format from the file extension (.cat = WordStat);
# pass format = explicitly if the guess is wrong.
lg_dict <- dictionary(file = "dictionary/laver-garry.cat")

The Laver-Garry is a hierarchical dictionary with many subcategories. We can use the summary() and str() function to get a better sense of our dictionary object. The summary only displays the topmost levels.

# summary() shows only the topmost categories of the hierarchical dictionary.
summary(lg_dict)
#>               Length Class       Mode     
#> CULTURE        3     dictionary2 list     
#> ECONOMY        3     dictionary2 list     
#> ENVIRONMENT    2     dictionary2 list     
#> GROUPS         2     dictionary2 list     
#> INSTITUTIONS   3     dictionary2 list     
#> LAW_AND_ORDER  2     dictionary2 list     
#> RURAL         16     -none-      character
#> URBAN          1     -none-      character
#> VALUES         2     dictionary2 list

With str we can see the whole structure of the object.

# str() exposes the full nested structure, including all subcategories.
str(lg_dict)
#> Formal class 'dictionary2' [package "quanteda"] with 2 slots
#>   ..@ .Data:List of 9
#>   .. ..$ :List of 4
#>   .. .. ..$ CULTURE-HIGH   :List of 1
#>   .. .. .. ..$ : chr [1:8] "art" "artistic" "dance" "galler*" ...
#>   .. .. ..$ CULTURE-POPULAR:List of 1
#>   .. .. .. ..$ : chr "media"
#>   .. .. ..$ SPORT          :List of 1
#>   .. .. .. ..$ : chr "angler*"
#>   .. .. ..$                : chr [1:3] "people" "war_in_iraq" "civil_war"
#>   .. ..$ :List of 3
#>   .. .. ..$ +STATE+:List of 1
#>   .. .. .. ..$ : chr [1:50] "accommodation" "age" "ambulance" "assist" ...
#>   .. .. ..$ =STATE=:List of 1
#>   .. .. .. ..$ : chr [1:71] "accountant" "accounting" "accounts" "advert*" ...
#>   .. .. ..$ -STATE-:List of 1
#>   .. .. .. ..$ : chr [1:62] "assets" "autonomy" "barrier*" "bid" ...
#>   .. ..$ :List of 2
#>   .. .. ..$ CON ENVIRONMENT:List of 1
#>   .. .. .. ..$ : chr "produc*"
#>   .. .. ..$ PRO ENVIRONMENT:List of 1
#>   .. .. .. ..$ : chr [1:28] "car" "catalytic" "chemical*" "chimney*" ...
#>   .. ..$ :List of 2
#>   .. .. ..$ ETHNIC:List of 1
#>   .. .. .. ..$ : chr [1:5] "asian*" "buddhist*" "ethnic*" "race" ...
#>   .. .. ..$ WOMEN :List of 1
#>   .. .. .. ..$ : chr [1:3] "girls" "woman" "women"
#>   .. ..$ :List of 3
#>   .. .. ..$ CONSERVATIVE:List of 1
#>   .. .. .. ..$ : chr [1:11] "authority" "continu*" "disrupt*" "inspect*" ...
#>   .. .. ..$ NEUTRAL     :List of 1
#>   .. .. .. ..$ : chr [1:38] "administr*" "advis*" "agenc*" "amalgamat*" ...
#>   .. .. ..$ RADICAL     :List of 1
#>   .. .. .. ..$ : chr [1:23] "abolition" "accountable" "answerable" "consult*" ...
#>   .. ..$ :List of 2
#>   .. .. ..$ LAW-CONSERVATIVE:List of 1
#>   .. .. .. ..$ : chr [1:52] "assaults" "bail" "burglar*" "constab*" ...
#>   .. .. ..$ LAW-LIBERAL     :List of 1
#>   .. .. .. ..$ : chr [1:2] "harassment" "non-custodial"
#>   .. ..$ :List of 1
#>   .. .. ..$ : chr [1:16] "agricultur*" "badgers" "bird*" "countryside" ...
#>   .. ..$ :List of 1
#>   .. .. ..$ : chr "town*"
#>   .. ..$ :List of 2
#>   .. .. ..$ CONSERVATIVE:List of 1
#>   .. .. .. ..$ : chr [1:32] "defend" "defended" "defending" "discipline" ...
#>   .. .. ..$ LIBERAL     :List of 1
#>   .. .. .. ..$ : chr [1:10] "cruel*" "discriminat*" "human*" "injustice*" ...
#>   ..@ meta :List of 3
#>   .. ..$ system:List of 5
#>   .. .. ..$ package-version:Classes 'package_version', 'numeric_version'  hidden list of 1
#>   .. .. .. ..$ : int [1:3] 2 1 2
#>   .. .. ..$ r-version      :Classes 'R_system_version', 'package_version', 'numeric_version'  hidden list of 1
#>   .. .. .. ..$ : int [1:3] 4 0 3
#>   .. .. ..$ system         : Named chr [1:3] "Windows" "x86-64" "Akos"
#>   .. .. .. ..- attr(*, "names")= chr [1:3] "sysname" "machine" "user"
#>   .. .. ..$ directory      : chr "E:/Dropbox/teaching/poltext/text_mining_workshop/04_dictionary"
#>   .. .. ..$ created        : Date[1:1], format: "2020-11-25"
#>   .. ..$ object:List of 2
#>   .. .. ..$ valuetype: chr "glob"
#>   .. .. ..$ separator: chr " "
#>   .. ..$ user  : list()

Let’s dig around a bit.

# Peek at five random entries from the pro-state economy subcategory.
# Backticks are needed because the category names are not syntactic R names.
sample(lg_dict$ECONOMY$`+STATE+`, 5)
#> [1] "vulnerable"    "pension"       "accommodation" "hardship"     
#> [5] "classes"


# ...and five from the conservative law-and-order subcategory.
sample(lg_dict$LAW_AND_ORDER$`LAW-CONSERVATIVE`, 5)
#> [1] "punish*"     "force*"      "unlawful"    "trafficker*" "joy-ride*"

2 LSD on british party manifesto

Load the UK election manifestos corpus from the data/ folder, or from the quanteda.corpora package.

# load() restores the saved object (data_corpus_ukmanifestos) into the global
# environment; the same corpus ships with the quanteda.corpora package.
load("data/data_corpus_ukmanifestos.rda")

summary(data_corpus_ukmanifestos)
#> Corpus consisting of 101 documents, showing 100 documents:
#> 
#>                    Text Types Tokens Sentences Country Type Year Language
#>     UK_natl_1945_en_Con  1752   6679       269      UK natl 1945       en
#>     UK_natl_1945_en_Lab  1433   5492       234      UK natl 1945       en
#>     UK_natl_1945_en_Lib  1208   3729       157      UK natl 1945       en
#>     UK_natl_1950_en_Con  2075   8075       366      UK natl 1950       en
#>     UK_natl_1950_en_Lab  1541   5392       274      UK natl 1950       en
#>     UK_natl_1950_en_Lib  1202   3322       136      UK natl 1950       en
#>     UK_natl_1951_en_Con  1006   2892       140      UK natl 1951       en
#>     UK_natl_1951_en_Lab   736   1996       117      UK natl 1951       en
#>     UK_natl_1951_en_Lib   816   2286       103      UK natl 1951       en
#>     UK_natl_1955_en_Con  2632  12944       608      UK natl 1955       en
#>     UK_natl_1955_en_Lab  1130   3183       146      UK natl 1955       en
#>     UK_natl_1959_en_Con  1676   5612       250      UK natl 1959       en
#>     UK_natl_1959_en_Lab  1676   5612       250      UK natl 1959       en
#>     UK_natl_1959_en_Lib   953   2823       139      UK natl 1959       en
#>     UK_natl_1964_en_Con  2246   8807       412      UK natl 1964       en
#>     UK_natl_1964_en_Lab  2493  10776       360      UK natl 1964       en
#>     UK_natl_1964_en_Lib  1422   4547       215      UK natl 1964       en
#>     UK_natl_1966_en_Con  1513   5071       277      UK natl 1966       en
#>     UK_natl_1966_en_Lab  2645  12043       569      UK natl 1966       en
#>     UK_natl_1966_en_Lib  1847   6544       338      UK natl 1966       en
#>     UK_natl_1970_en_Con  2331  11645       547      UK natl 1970       en
#>     UK_natl_1970_en_Lab  2661  13016       529      UK natl 1970       en
#>     UK_natl_1970_en_Lib  1096   3151       184      UK natl 1970       en
#>     UK_natl_1974_en_Con  2817  15562       662      UK natl 1974       en
#>     UK_natl_1974_en_Lab  2205   9588       367      UK natl 1974       en
#>     UK_natl_1974_en_Lib  2515  10145       353      UK natl 1974       en
#>     UK_natl_1979_en_Con  8092  98066      4668      UK natl 1979       en
#>     UK_natl_1979_en_Lab  6956  78274      3621      UK natl 1979       en
#>     UK_natl_1979_en_Lib  2119   7738       336      UK natl 1979       en
#>     UK_natl_1983_en_Con  2774  13258       606      UK natl 1983       en
#>     UK_natl_1983_en_Lab  3682  25455      1114      UK natl 1983       en
#>  UK_natl_1983_en_LibSDP  2893  14570       495      UK natl 1983       en
#>     UK_natl_1987_en_Con  3517  19682       914      UK natl 1987       en
#>     UK_natl_1987_en_Lab  2389  10012       450      UK natl 1987       en
#>  UK_natl_1987_en_LibSDP  3326  21349       636      UK natl 1987       en
#>     UK_natl_1992_en_Con  4676  33116      1585      UK natl 1992       en
#>     UK_natl_1992_en_Lab  2384  12551       562      UK natl 1992       en
#>      UK_natl_1992_en_LD  3170  19435       792      UK natl 1992       en
#>    UK_natl_1997_en_Comm  1082   3347       215      UK natl 1997       en
#>     UK_natl_1997_en_Con  3181  23388      1188      UK natl 1997       en
#>     UK_natl_1997_en_DUP   985   2930       138      UK natl 1997       en
#>     UK_natl_1997_en_Lab  3017  19356       822      UK natl 1997       en
#>      UK_natl_1997_en_LD  2824  15988       852      UK natl 1997       en
#>      UK_natl_1997_en_ND  3489  19028       842      UK natl 1997       en
#>     UK_natl_1997_en_NIA  1845   7510       321      UK natl 1997       en
#>      UK_natl_1997_en_PA  1027   2754       120      UK natl 1997       en
#>     UK_natl_1997_en_PCy  3278  17891       765      UK natl 1997       en
#>     UK_natl_1997_en_PUP   548   1326        91      UK natl 1997       en
#>     UK_natl_1997_en_SEP  2028   8394       350      UK natl 1997       en
#>      UK_natl_1997_en_SF  1792   6540       254      UK natl 1997       en
#>     UK_natl_1997_en_SGr   771   2012        95      UK natl 1997       en
#>      UK_natl_1997_en_TW  2293   7971       304      UK natl 1997       en
#>    UK_natl_1997_en_UKIP  2826  13089       488      UK natl 1997       en
#>     UK_natl_1997_en_UUP  2261   9244       392      UK natl 1997       en
#>    UK_natl_2001_en_Comm  1653   5020       201      UK natl 2001       en
#>     UK_natl_2001_en_Con  2841  14564       669      UK natl 2001       en
#>     UK_natl_2001_en_Dem  1850   6087       199      UK natl 2001       en
#>      UK_natl_2001_en_LA   457    998        49      UK natl 2001       en
#>     UK_natl_2001_en_Lab  4213  32350      1512      UK natl 2001       en
#>      UK_natl_2001_en_LD  3924  23643      1221      UK natl 2001       en
#>    UK_natl_2001_en_MRLP   977   2170       103      UK natl 2001       en
#>      UK_natl_2001_en_PA   996   2678       124      UK natl 2001       en
#>     UK_natl_2001_en_PCy  1678   7124       246      UK natl 2001       en
#>    UK_natl_2001_en_SCon  3060  16281       653      UK natl 2001       en
#>      UK_natl_2001_en_SF  2073   8070       487      UK natl 2001       en
#>    UK_natl_2001_en_SLab  4139  31615      1372      UK natl 2001       en
#>     UK_natl_2001_en_SLD  2620  12184       572      UK natl 2001       en
#>     UK_natl_2001_en_SNP  2440  11348       457      UK natl 2001       en
#>    UK_natl_2001_en_SSoc  1938   7460       333      UK natl 2001       en
#>   UK_natl_2001_en_Stuck   704   1568        63      UK natl 2001       en
#>     UK_natl_2001_en_UUP  1277   4120       162      UK natl 2001       en
#>     UK_natl_2005_en_BNP  5027  28853      1030      UK natl 2005       en
#>     UK_natl_2005_en_CAP   573   1293        55      UK natl 2005       en
#>     UK_natl_2005_en_Con  2096   8493       388      UK natl 2005       en
#>      UK_natl_2005_en_CP  1935   6630       210      UK natl 2005       en
#>     UK_natl_2005_en_DUP  2772  10423       324      UK natl 2005       en
#>     UK_natl_2005_en_EDP  1911   8085       382      UK natl 2005       en
#>     UK_natl_2005_en_EIP  2354   8768       355      UK natl 2005       en
#>     UK_natl_2005_en_FSP   304    689        32      UK natl 2005       en
#>      UK_natl_2005_en_FW  1822   7068       205      UK natl 2005       en
#>      UK_natl_2005_en_Gr  2535  12255       424      UK natl 2005       en
#>     UK_natl_2005_en_IGV  3210  13219       563      UK natl 2005       en
#>     UK_natl_2005_en_Lab  4117  26688       879      UK natl 2005       en
#>      UK_natl_2005_en_LD  3300  18030       816      UK natl 2005       en
#>      UK_natl_2005_en_MK  1594   5830       237      UK natl 2005       en
#>    UK_natl_2005_en_OMRL  1089   2934       131      UK natl 2005       en
#>     UK_natl_2005_en_PCy  1974   8485       308      UK natl 2005       en
#>      UK_natl_2005_en_PP   992   2874       165      UK natl 2005       en
#>     UK_natl_2005_en_PVP   934   3648       157      UK natl 2005       en
#>    UK_natl_2005_en_Resp  2886  13327       475      UK natl 2005       en
#>      UK_natl_2005_en_RT  1336   3776       128      UK natl 2005       en
#>    UK_natl_2005_en_SDLP  3247  16788       761      UK natl 2005       en
#>      UK_natl_2005_en_SF  4355  22598       607      UK natl 2005       en
#>     UK_natl_2005_en_SNP  1063   2989       169      UK natl 2005       en
#>      UK_natl_2005_en_SP  1407   5173       259      UK natl 2005       en
#>     UK_natl_2005_en_SSP  4339  23662      1046      UK natl 2005       en
#>      UK_natl_2005_en_TW  1710   5703       254      UK natl 2005       en
#>    UK_natl_2005_en_UKIP  2506  10251       406      UK natl 2005       en
#>     UK_natl_2005_en_UUP  1332   4144       150      UK natl 2005       en
#>     UK_natl_2005_en_Ver  1182   3498       110      UK natl 2005       en
#>   Party
#>     Con
#>     Lab
#>     Lib
#>     Con
#>     Lab
#>     Lib
#>     Con
#>     Lab
#>     Lib
#>     Con
#>     Lab
#>     Con
#>     Lab
#>     Lib
#>     Con
#>     Lab
#>     Lib
#>     Con
#>     Lab
#>     Lib
#>     Con
#>     Lab
#>     Lib
#>     Con
#>     Lab
#>     Lib
#>     Con
#>     Lab
#>     Lib
#>     Con
#>     Lab
#>  LibSDP
#>     Con
#>     Lab
#>  LibSDP
#>     Con
#>     Lab
#>      LD
#>    Comm
#>     Con
#>     DUP
#>     Lab
#>      LD
#>      ND
#>     NIA
#>      PA
#>     PCy
#>     PUP
#>     SEP
#>      SF
#>     SGr
#>      TW
#>    UKIP
#>     UUP
#>    Comm
#>     Con
#>     Dem
#>      LA
#>     Lab
#>      LD
#>    MRLP
#>      PA
#>     PCy
#>    SCon
#>      SF
#>    SLab
#>     SLD
#>     SNP
#>    SSoc
#>   Stuck
#>     UUP
#>     BNP
#>     CAP
#>     Con
#>      CP
#>     DUP
#>     EDP
#>     EIP
#>     FSP
#>      FW
#>      Gr
#>     IGV
#>     Lab
#>      LD
#>      MK
#>    OMRL
#>     PCy
#>      PP
#>     PVP
#>    Resp
#>      RT
#>    SDLP
#>      SF
#>     SNP
#>      SP
#>     SSP
#>      TW
#>    UKIP
#>     UUP
#>     Ver
#>                                                                          _source
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1945_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1945_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1945_en_Lib.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1950_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1950_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1950_en_Lib.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1951_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1951_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1951_en_Lib.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1955_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1955_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1959_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1959_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1959_en_Lib.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1964_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1964_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1964_en_Lib.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1966_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1966_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1966_en_Lib.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1970_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1970_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1970_en_Lib.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1974_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1974_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1974_en_Lib.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1979_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1979_en_Lab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1979_en_Lib.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1983_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1983_en_Lab.txt
#>  /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1983_en_LibSDP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1987_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1987_en_Lab.txt
#>  /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1987_en_LibSDP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1992_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1992_en_Lab.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1992_en_LD.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_Comm.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_DUP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_Lab.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_LD.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_ND.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_NIA.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_PA.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_PCy.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_PUP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_SEP.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_SF.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_SGr.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_TW.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_UKIP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1997_en_UUP.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_Comm.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_Con.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_Dem.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_LA.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_Lab.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_LD.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_MRLP.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_PA.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_PCy.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_SCon.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_SF.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_SLab.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_SLD.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_SNP.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_SSoc.txt
#>   /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_Stuck.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2001_en_UUP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_BNP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_CAP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_Con.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_CP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_DUP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_EDP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_EIP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_FSP.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_FW.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_Gr.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_IGV.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_Lab.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_LD.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_MK.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_OMRL.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_PCy.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_PP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_PVP.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_Resp.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_RT.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_SDLP.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_SF.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_SNP.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_SP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_SSP.txt
#>      /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_TW.txt
#>    /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_UKIP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_UUP.txt
#>     /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_2005_en_Ver.txt

We don’t really need all this, so let’s subset our corpus to the 1970 and 1974 elections.

# Keep only the manifestos from the 1970 and 1974 elections (six documents:
# Con, Lab, Lib for each year), filtering on the Year docvar.
uk_subset <- corpus_subset(data_corpus_ukmanifestos, Year >= 1970 & Year <= 1974)

uk_subset
#> Corpus consisting of 6 documents and 6 docvars.
#> UK_natl_1970_en_Con :
#> "CONSERVATIVE PARTY: 1970  A Better Tomorrow FOREWORD  This M..."
#> 
#> UK_natl_1970_en_Lab :
#> "Labour Party: 1970  NOW BRITAIN'S STRONG - LET'S MAKE IT GRE..."
#> 
#> UK_natl_1970_en_Lib :
#> "LIBERAL PARTY: 1970  What a Life! There must surely be a bet..."
#> 
#> UK_natl_1974_en_Con :
#> "Conservative Manifesto: October 1974  Putting Britain First ..."
#> 
#> UK_natl_1974_en_Lab :
#> "The Labour Party Manifesto: October, 1974  BRITAIN WILL WIN ..."
#> 
#> UK_natl_1974_en_Lib :
#> "The Liberal Party:  'Change the face of Britain' THE CRISIS ..."

We know that we want to use the Lexicoder dictionary to estimate the positive or negative sentiments in these party manifestos. It is part of quanteda, as data_dictionary_LSD2015. Let’s look around.

# The Lexicoder Sentiment Dictionary (2015) ships with quanteda itself.
# It has four flat categories: negative, positive, and their negated forms
# (neg_positive = "not good"-style phrases, neg_negative = "not bad"-style).
dict_lsd <- data_dictionary_LSD2015

summary(dict_lsd)
#>              Length Class  Mode     
#> negative     2858   -none- character
#> positive     1709   -none- character
#> neg_positive 1721   -none- character
#> neg_negative 2860   -none- character

# Full structure, including the metadata slot with source and license info.
str(dict_lsd)
#> Formal class 'dictionary2' [package "quanteda"] with 2 slots
#>   ..@ .Data:List of 4
#>   .. ..$ :List of 1
#>   .. .. ..$ : chr [1:2858] "a lie" "abandon*" "abas*" "abattoir*" ...
#>   .. ..$ :List of 1
#>   .. .. ..$ : chr [1:1709] "ability*" "abound*" "absolv*" "absorbent*" ...
#>   .. ..$ :List of 1
#>   .. .. ..$ : chr [1:1721] "best not" "better not" "no damag*" "no no" ...
#>   .. ..$ :List of 1
#>   .. .. ..$ : chr [1:2860] "not a lie" "not abandon*" "not abas*" "not abattoir*" ...
#>   ..@ meta :List of 3
#>   .. ..$ system:List of 5
#>   .. .. ..$ package-version:Classes 'package_version', 'numeric_version'  hidden list of 1
#>   .. .. .. ..$ : int [1:3] 1 9 9009
#>   .. .. ..$ r-version      :Classes 'R_system_version', 'package_version', 'numeric_version'  hidden list of 1
#>   .. .. .. ..$ : int [1:3] 3 6 2
#>   .. .. ..$ system         : Named chr [1:3] "Darwin" "x86_64" "kbenoit"
#>   .. .. .. ..- attr(*, "names")= chr [1:3] "sysname" "machine" "user"
#>   .. .. ..$ directory      : chr "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda"
#>   .. .. ..$ created        : Date[1:1], format: "2020-02-17"
#>   .. ..$ object:List of 2
#>   .. .. ..$ valuetype: chr "glob"
#>   .. .. ..$ separator: chr " "
#>   .. ..$ user  :List of 6
#>   .. .. ..$ title      : chr "Lexicoder Sentiment Dictionary (2015)"
#>   .. .. ..$ description: chr "The 2015 Lexicoder Sentiment Dictionary in quanteda dictionary format.  \n\nThe dictionary consists of 2,858 \""| __truncated__
#>   .. .. ..$ source     : chr "Young, L. & Soroka, S. (2012). Affective News: The Automated Coding of Sentiment in Political Texts. Political "| __truncated__
#>   .. .. ..$ url        : chr "http://lexicoder.com"
#>   .. .. ..$ license    : chr "The LSD is available for non-commercial academic purposes only. By using data_dictionary_LSD2015, you accept th"| __truncated__
#>   .. .. ..$ keywords   : chr [1:4] "political" "news" "sentiment" "media"

# Random peeks at each category; note the glob-style wildcards (*).
sample(dict_lsd$negative, 10)
#>  [1] "browbeaten*" "ludicrous*"  "balk"        "unlikely"    "subver*"    
#>  [6] "paralyze*"   "doleful*"    "critiqu*"    "helpless*"   "drowsy*"

sample(dict_lsd$positive, 10)
#>  [1] "marvel*"       "fondness"      "tenderness"    "reconcil*"    
#>  [5] "impresses"     "cheery*"       "sanitary*"     "dispensatory*"
#>  [9] "safety*"       "accurat*"

# The negated-positive category is multi-word patterns ("not X").
sample(dict_lsd$neg_positive, 10)
#>  [1] "not precis*"    "not coaction*"  "not correcting" "not comfort"   
#>  [5] "not warmed"     "not righted"    "not humored*"   "not loyal*"    
#>  [9] "not attain*"    "not valour*"

What decisions should we make when we create our document-feature matrix?

# Build the document-feature matrix. Passing a corpus plus cleaning arguments
# straight to dfm() is deprecated in quanteda >= 3; the supported route is to
# tokenise first and drop punctuation/stopwords on the tokens object. The
# result is identical to the old one-liner
#   dfm(uk_subset, tolower = TRUE, remove_punct = TRUE, remove = stopwords("en"))
# because tokens_remove() matches case-insensitively by default.
uk_dfm <- uk_subset %>% 
    tokens(remove_punct = TRUE) %>% 
    tokens_remove(stopwords("en")) %>% 
    dfm(tolower = TRUE)

To apply the dictionary, we can use the dictionary argument of the dfm function or, equivalently, the dfm_lookup() function.

# Map the dfm features onto the four LSD categories. Calling dfm() on an
# existing dfm with a dictionary argument is deprecated in quanteda >= 3;
# dfm_lookup() is the supported equivalent and produces the same result.
uk_sentiment <- dfm_lookup(uk_dfm, dictionary = dict_lsd)

uk_sentiment
#> Document-feature matrix of: 6 documents, 4 features (50.0% sparse) and 6 docvars.
#>                      features
#> docs                  negative positive neg_positive neg_negative
#>   UK_natl_1970_en_Con      349      782            0            0
#>   UK_natl_1970_en_Lab      330      698            0            0
#>   UK_natl_1970_en_Lib       85      177            0            0
#>   UK_natl_1974_en_Con      437      899            0            0
#>   UK_natl_1974_en_Lab      273      500            0            0
#>   UK_natl_1974_en_Lib      350      495            0            0

Let’s add the net sentiment to the docvars of our corpus. We should also create a net sentiment from the positive and negative categories.

# Net sentiment = positive hits minus negative hits per document, stored as a
# new docvar on the corpus. Index the dfm columns by feature name rather than
# by position ([, 2] and [, 1] in the original) so the computation cannot
# silently break if the dictionary's category order ever changes.
docvars(uk_subset, "net_sentiment") <-
    as.numeric(uk_sentiment[, "positive"]) - as.numeric(uk_sentiment[, "negative"])

docvars(uk_subset)
#>   Country Type Year Language Party
#> 1      UK natl 1970       en   Con
#> 2      UK natl 1970       en   Lab
#> 3      UK natl 1970       en   Lib
#> 4      UK natl 1974       en   Con
#> 5      UK natl 1974       en   Lab
#> 6      UK natl 1974       en   Lib
#>                                                                        _source
#> 1 /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1970_en_Con.txt
#> 2 /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1970_en_Lab.txt
#> 3 /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1970_en_Lib.txt
#> 4 /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1974_en_Con.txt
#> 5 /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1974_en_Lab.txt
#> 6 /Users/kbenoit/Dropbox/QUANTESS/corpora/ukManRenamed/UK_natl_1974_en_Lib.txt
#>   net_sentiment
#> 1           433
#> 2           368
#> 3            92
#> 4           462
#> 5           227
#> 6           145

Time to plot! First we create a data frame from our corpus variables.

# Turn the corpus docvars into a plotting data frame, promoting the row names
# to an explicit "text" column.
# NOTE(review): docvars() appears to already return a data.frame, which would
# make the as.data.frame() wrapper a harmless no-op -- confirm and simplify.
uk_df <- as.data.frame(docvars(uk_subset)) %>% 
    tibble::rownames_to_column("text")

# Net sentiment per party across the two elections: points shaped and coloured
# by party, joined by faint lines to show the trend.
ggplot(data = uk_df, mapping = aes(x = Year, y = net_sentiment, color = Party)) +
    geom_point(aes(shape = Party), size = 2.5) + 
    geom_line(alpha = 0.5) + 
    theme_minimal()

Did we miss anything? What are some additional steps that we would do in a real analysis?

3 NRC on Tweets

As the next demonstration we will use another type of dictionary on a different type of corpus. The NRC Word-Emotion Association Lexicon comes from the quanteda.dictionaries package, and the corpus is either loaded from the data/ folder or from the quanteda.corpora package.

# The NRC Word-Emotion Association Lexicon: ten flat categories (eight
# emotions plus positive/negative polarity), from quanteda.dictionaries.
nrc_dict <- quanteda.dictionaries::data_dictionary_NRC

summary(nrc_dict)
#>              Length Class  Mode     
#> anger        1247   -none- character
#> anticipation  839   -none- character
#> disgust      1058   -none- character
#> fear         1476   -none- character
#> joy           689   -none- character
#> negative     3324   -none- character
#> positive     2312   -none- character
#> sadness      1191   -none- character
#> surprise      534   -none- character
#> trust        1231   -none- character

The corpus we use is Trump tweets in the 2016 election campaign. An interesting analysis (where the data is coming from) is here from David Robinson: http://varianceexplained.org/r/trump-tweets/. We’ll replicate some of that work with quanteda.

# load() restores the saved data frame (trump_tweets_df) into the global
# environment.
load("data/trump_tweets.Rda")

# Quick column-wise overview: 1,512 tweets, 16 variables.
glimpse(trump_tweets_df)
#> Rows: 1,512
#> Columns: 16
#> $ text          <chr> "My economic policy speech will be carried live at 12...
#> $ favorited     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
#> $ favoriteCount <dbl> 9214, 6981, 15724, 19837, 34051, 29831, 19223, 19543,...
#> $ replyToSN     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ created       <dttm> 2016-08-08 15:20:44, 2016-08-08 13:28:20, 2016-08-08...
#> $ truncated     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
#> $ replyToSID    <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ id            <chr> "762669882571980801", "762641595439190016", "76243965...
#> $ replyToUID    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ statusSource  <chr> "<a href=\"http://twitter.com/download/android\" rel=...
#> $ screenName    <chr> "realDonaldTrump", "realDonaldTrump", "realDonaldTrum...
#> $ retweetCount  <dbl> 3107, 2390, 6691, 6402, 11717, 9892, 5784, 7930, 2466...
#> $ isRetweet     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
#> $ retweeted     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS...
#> $ longitude     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
#> $ latitude      <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...

Let’s make some adjustments to our data. We want to extract the device the tweet is made and only keep some relevant variables.

# Tidy the tweet data: derive the posting device from the HTML statusSource
# string, drop retweets and tweets from any other client, and keep only the
# columns used downstream.
trump_tweets_df <- trump_tweets_df %>% 
    mutate(device = str_extract(statusSource, "android|iphone")) %>% 
    # !isRetweet is the idiomatic spelling of isRetweet == FALSE; filter()
    # drops NA rows under either form, so behaviour is unchanged.
    filter(!isRetweet, device %in% c("android", "iphone")) %>% 
    select(text, device, created, favoriteCount, retweetCount)
# Promote the data frame to a quanteda corpus; the "text" column becomes the
# document text and the remaining columns become docvars.
trump_corpus <- corpus(trump_tweets_df)

Let’s apply our dictionary, but now with the dfm_lookup function.

# Tokenize explicitly before building the dfm: calling dfm() directly
# on a corpus is deprecated since quanteda v3. dfm_lookup() then
# collapses the token features into the NRC sentiment categories.
trump_sentiment <- trump_corpus %>% 
    tokens() %>% 
    dfm() %>% 
    dfm_lookup(dictionary = nrc_dict)

head(trump_sentiment, 15)
#> Document-feature matrix of: 15 documents, 10 features (68.7% sparse) and 4 docvars.
#>        features
#> docs    anger anticipation disgust fear joy negative positive sadness surprise
#>   text1     0            1       0    0   1        0        2       0        0
#>   text2     0            1       0    0   0        0        1       0        0
#>   text3     0            0       0    0   0        0        1       1        1
#>   text4     1            1       1    1   0        2        1       1        0
#>   text5     2            0       1    1   1        2        1       2        0
#>   text6     0            0       0    0   0        1        2       0        0
#>        features
#> docs    trust
#>   text1     2
#>   text2     0
#>   text3     1
#>   text4     2
#>   text5     0
#>   text6     0
#> [ reached max_ndoc ... 9 more documents ]

We add the sentiment scores back to our original dataset.

# Convert the sentiment dfm to a plain data frame: one row per tweet,
# one column per NRC category, plus a doc_id column added by convert().
trump_sentiment_df <- convert(trump_sentiment, to = "data.frame")

# Rows of the dfm are in the same order as the corpus (and hence as
# trump_tweets_df), so the scores can be attached column-wise.
trump_tweets_df <- bind_cols(trump_tweets_df, trump_sentiment_df)

Some plots to satisfy our curiosity.

# Scatter the per-tweet anger count against (log) retweets, colored by
# the device the tweet was sent from.
ggplot(trump_tweets_df, aes(x = anger, y = log(retweetCount), color = device)) +
    geom_point(alpha = 0.45) +
    theme_minimal()

# OLS of log retweets on the NRC sentiment counts plus a device dummy
# (iphone vs. the android baseline).
summary(lm(log(retweetCount) ~ anger + fear + joy + negative + positive + sadness + surprise + trust + factor(device), data = trump_tweets_df))
#> 
#> Call:
#> lm(formula = log(retweetCount) ~ anger + fear + joy + negative + 
#>     positive + sadness + surprise + trust + factor(device), data = trump_tweets_df)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -2.0892 -0.4347 -0.0081  0.4159  3.5490 
#> 
#> Coefficients:
#>                       Estimate Std. Error t value Pr(>|t|)    
#> (Intercept)           8.337912   0.033336 250.114  < 2e-16 ***
#> anger                -0.002952   0.041074  -0.072 0.942715    
#> fear                  0.169597   0.036406   4.658 3.49e-06 ***
#> joy                  -0.096870   0.038588  -2.510 0.012174 *  
#> negative              0.116316   0.031753   3.663 0.000259 ***
#> positive              0.010661   0.023968   0.445 0.656542    
#> sadness              -0.057231   0.042555  -1.345 0.178886    
#> surprise             -0.093099   0.034935  -2.665 0.007790 ** 
#> trust                 0.117524   0.028227   4.163 3.33e-05 ***
#> factor(device)iphone  0.153878   0.035580   4.325 1.64e-05 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 0.63 on 1380 degrees of freedom
#> Multiple R-squared:  0.09526,    Adjusted R-squared:  0.08936 
#> F-statistic: 16.14 on 9 and 1380 DF,  p-value: < 2.2e-16

Visualize our results

# Refit the model and store it so we can extract tidy coefficients.
ols <- lm(log(retweetCount) ~ anger + fear + joy + negative + positive + sadness + surprise + trust + factor(device), data = trump_tweets_df)

# One row per coefficient, with confidence-interval columns from broom.
ols_coef <- tidy(ols, conf.int = TRUE)

# Coefficient plot, dropping the intercept (row 1). `linewidth` replaces
# the `size` aesthetic for line widths, deprecated since ggplot2 3.4.
ggplot(ols_coef[2:10,], aes(estimate, term, color = term)) +
    geom_point() +
    geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
    geom_vline(xintercept = 0, linewidth = 1, linetype = "dotted") +
    theme_minimal()

4 Create dictionary

As a quick exercise we’ll create a dictionary for Trump tweets. For inspiration: https://www.nytimes.com/interactive/2016/01/28/upshot/donald-trump-twitter-insults.html

# A small custom dictionary: words Trump uses as insults vs. positive spin.
dict_trump <- dictionary(list(insult = c("crooked", "fake", "crazy", "failing", "wrong", "phony"),
                              happy = c("great", "proud", "deal", "approval")))

# Tokenize explicitly, then build the dfm and count dictionary matches
# per tweet; dfm() called directly on a corpus is deprecated since
# quanteda v3.
trump_insult <- trump_corpus %>% 
    tokens() %>% 
    dfm() %>% 
    dfm_lookup(dictionary = dict_trump)

head(trump_insult)
#> Document-feature matrix of: 6 documents, 2 features (83.3% sparse) and 4 docvars.
#>        features
#> docs    insult happy
#>   text1      0     0
#>   text2      0     0
#>   text3      0     0
#>   text4      0     0
#>   text5      1     0
#>   text6      1     0