Word Vectors for Treaties

Experimenting with word vectors on Native American treaties data.

# Load the packages used in this analysis. Startup messages are suppressed;
# the "built under R version ..." warnings are still shown in the output.
suppressPackageStartupMessages(library(text2vec))
## Warning: package 'text2vec' was built under R version 3.4.3
suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.3
suppressPackageStartupMessages(library(wordVectors))
suppressPackageStartupMessages(library(tsne))

# One-time corpus preparation, left commented out. Presumably it turns the raw
# text under data/ into the lowercased file treaty_all.txt -- confirm against
# the wordVectors documentation before re-running.
#prep_word2vec("data/", "treaty_all.txt", lowercase = TRUE)

# One-time training run, left commented out. Its `output` path is the same
# .bin file that read.vectors() loads below, so the model is read from disk
# here instead of being retrained (100-dimensional vectors, window of 12).
#treaty_model <- train_word2vec("data/treaty_all.txt", output = "data/treaty_all_vectors.bin", threads = 6, vectors = 100, window = 12)
# Load the previously saved vectors into `treaty_model`, the object queried in
# the rest of this document.
treaty_model <- read.vectors("data/treaty_all_vectors.bin")
## Filename ends with .bin, so reading in binary format
## Reading a word2vec binary file of 8080 rows and 100 columns
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   2%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |==                                                               |   4%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=======                                                          |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |=======                                                          |  12%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |========                                                         |  13%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |=========                                                        |  15%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |===========                                                      |  18%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |============                                                     |  19%
  |                                                                       
  |=============                                                    |  19%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |=============                                                    |  21%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |==============                                                   |  22%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |===============                                                  |  24%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |=================                                                |  27%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |====================                                             |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |====================                                             |  32%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=====================                                            |  33%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |======================                                           |  35%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |========================                                         |  38%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |==========================                                       |  41%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |============================                                     |  44%
  |                                                                       
  |=============================                                    |  44%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |==============================                                   |  45%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |==============================                                   |  47%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  48%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |================================                                 |  50%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |=================================                                |  52%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |==================================                               |  53%
  |                                                                       
  |===================================                              |  53%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |===================================                              |  55%
  |                                                                       
  |====================================                             |  55%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=====================================                            |  56%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |=====================================                            |  58%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  59%
  |                                                                       
  |=======================================                          |  60%
  |                                                                       
  |=======================================                          |  61%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |=========================================                        |  62%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |===========================================                      |  65%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  68%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |==============================================                   |  72%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |================================================                 |  75%
  |                                                                       
  |=================================================                |  75%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |==================================================               |  76%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |==================================================               |  78%
  |                                                                       
  |===================================================              |  78%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |====================================================             |  79%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |====================================================             |  81%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |=====================================================            |  82%
  |                                                                       
  |======================================================           |  82%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  88%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |==========================================================       |  90%
  |                                                                       
  |===========================================================      |  90%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |===========================================================      |  92%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |============================================================     |  93%
  |                                                                       
  |=============================================================    |  93%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |=============================================================    |  95%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |===============================================================  |  96%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |===============================================================  |  98%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |================================================================ |  99%
  |                                                                       
  |=================================================================|  99%
  |                                                                       
  |=================================================================| 100%

And let's look at a few things.

# Nearest neighbours of single words. Each call prints a named numeric vector:
# the names are vocabulary words and the values are a distance-like score in
# increasing order, so smaller means closer. The query word itself comes first
# with a score of ~0 (floating-point noise). Presumably the score is cosine
# distance -- confirm against the wordVectors documentation.
treaty_model %>% nearest_to(treaty_model[["sioux"]])
##        sioux       dakota        brulé    yanktonai         arcs 
## 5.551115e-16 2.600296e-01 2.923641e-01 3.072613e-01 3.526654e-01 
##    onkpahpah  represented   wahpakoota    o'gallala   yanktonais 
## 3.639902e-01 3.830032e-01 3.850588e-01 3.903711e-01 3.926642e-01
treaty_model %>% nearest_to(treaty_model[["land"]])
##         land        tract   contiguous    adjoining        acres 
## 4.440892e-16 2.114601e-01 3.266464e-01 3.290219e-01 3.371009e-01 
##     sections     reserved   containing       tracts      contain 
## 3.729826e-01 3.748342e-01 3.750511e-01 3.800679e-01 3.811347e-01
treaty_model %>% nearest_to(treaty_model[["women"]])
##        women         kill      capture        scalp intermarried 
## 2.220446e-16 4.120251e-01 4.306975e-01 4.316945e-01 4.437425e-01 
##      females     children      coaches         ages       female 
## 4.552839e-01 4.709732e-01 4.903550e-01 4.944802e-01 4.956098e-01
treaty_model %>% nearest_to(treaty_model[["privilege"]])
##    privilege    gathering        roots      hunting    pasturing 
## 3.330669e-16 2.737482e-01 3.131073e-01 3.181663e-01 3.594188e-01 
##       curing      berries        enjoy    unclaimed      grazing 
## 3.646737e-01 3.731652e-01 3.975566e-01 4.358794e-01 4.417746e-01
# Queries built from several words at once: `[[` is given a character vector
# and its result is passed to nearest_to() as a single query. Presumably `[[`
# averages the vectors of the listed words -- confirm against the wordVectors
# documentation. Consistent with that, none of the query words below scores
# ~0 against the combined query.
treaty_model %>% nearest_to(treaty_model[[c("land","water")]])
##      water       land      tract    beard's      front     parcel 
##  0.1467987  0.3181458  0.3284152  0.3795273  0.4030645  0.4111357 
##    parcels      lying   begining contiguous 
##  0.4130324  0.4161033  0.4219893  0.4316613
treaty_model %>% nearest_to(treaty_model[[c("sovereign","sovereignty")]])
##   sovereign sovereignty  whatsoever   whosoever       power       iaway 
##  0.08647106  0.24945685  0.34318705  0.40012940  0.45668615  0.53273085 
##    whatever  piankishaw     release   covenants 
##  0.53772180  0.54511466  0.55134641  0.55368069
treaty_model %>% nearest_to(treaty_model[[c("survey","remove")]])
##      survey      remove     removed     removal      within    surveyed 
##   0.1718791   0.2440431   0.3752076   0.4286823   0.4327824   0.4359293 
##   selection reservation       homes     expense 
##   0.4459670   0.4564428   0.4565563   0.4612289
treaty_model %>% nearest_to(treaty_model[[c("never","until","unless")]])
##    unless     never     until otherwise       nor exchanged       but 
## 0.2508166 0.2509114 0.2756127 0.3032240 0.3572091 0.3970306 0.4010554 
##    leased       not  disposed 
## 0.4017218 0.4084792 0.4156061
# Six loosely related words in one query: of the query words, only "liquor"
# itself appears among the ten results.
treaty_model %>% nearest_to(treaty_model[[c("sovereign","survey","remove","privilege","annum","liquor")]])
##     liquor     paying      bring  restraint    intrude     excess 
##  0.4130184  0.4328057  0.4404356  0.4514574  0.4537095  0.4561355 
##        any  procedure diminished       lien 
##  0.4573935  0.4596945  0.4644880  0.4717117
treaty_model %>% nearest_to(treaty_model[[c("rights","citizen","citizens","independent","sovereign","sovereignty")]])
##   citizens  whosoever  sovereign   subjects whatsoever    citizen 
##  0.2185664  0.3021226  0.3092277  0.3426967  0.3613456  0.3613839 
##        all intentions   property   whatever 
##  0.3780966  0.3795283  0.3900762  0.3991890
treaty_model %>% nearest_to(treaty_model[[c("savage","depredation","depredations","civilized","hostile","friendly","enlightened")]])
## depredations     friendly   aggression  depredation       commit 
##    0.1706874    0.2559353    0.2751960    0.2868044    0.2988888 
##    community   violations   intentions        refer       resort 
##    0.2992553    0.3036988    0.3124136    0.3254526    0.3343209
# Thematic queries that combine several related words into one query via `[[`.
# NOTE(review): "chritianity" in the first query looks like a misspelling of
# "christianity". Neither spelling appears in the results below. Correcting it
# means re-running the query and regenerating the recorded output, so it is
# left as originally run.
treaty_model %>% nearest_to(treaty_model[[c("religion","chritianity","missionaries","mission")]])
##      mission     religion     catholic   enclosures       school 
##    0.2205716    0.3022282    0.3034389    0.3053957    0.3285781 
## missionaries    graveyard       solely      society     churches 
##    0.3364435    0.3476846    0.3540666    0.3735994    0.3815150
treaty_model %>% nearest_to(treaty_model[[c("fishing","hunting")]])
##    fishing    hunting    grounds  gathering      roots    berries 
## 0.09345720 0.09849956 0.22297792 0.22742867 0.26133680 0.29470077 
##  pasturing  privilege  temporary     curing 
## 0.30648832 0.31809462 0.32094083 0.34211069
treaty_model %>% nearest_to(treaty_model[[c("right","rights","privilege","privileges")]])
##     rights privileges  privilege      right immunities      enjoy 
##  0.1380672  0.1385519  0.2476362  0.2645462  0.2727490  0.2898248 
##    secured   acquired   conceded        all 
##  0.3605596  0.3973752  0.4103890  0.4111070
treaty_model %>% nearest_to(treaty_model[[c("wooded","woods","grass","hills")]])
##     hills     woods     grass       gap    pigeon     cowee    salmon 
## 0.1888916 0.2129462 0.2185929 0.2691507 0.3083977 0.3173047 0.3217354 
##     sweet   occunna     trail 
## 0.3264340 0.3288510 0.3393208
treaty_model %>% nearest_to(treaty_model[[c("education","school")]])
## education    school   schools    taught   support  boarding    manual 
## 0.1100669 0.1163392 0.1793846 0.2619291 0.2725217 0.2875050 0.2914613 
##   managed     labor    pupils 
## 0.3367332 0.3372301 0.3397492
treaty_model %>% nearest_to(treaty_model[[c("children","family")]])
##    family  children       age      each    orphan   upwards    single 
## 0.1634594 0.2001839 0.2417852 0.2830500 0.3193517 0.3240626 0.3266955 
##     child    female      male 
## 0.3420196 0.3527515 0.3594355
treaty_model %>% nearest_to(treaty_model[["civilization"]])
## civilization  improvement  advancement      promote      welfare 
## 2.220446e-16 2.166492e-01 2.304636e-01 2.642834e-01 2.651223e-01 
##   calculated      advance         tend      improve   prosperity 
## 2.898264e-01 2.966781e-01 2.979192e-01 3.008211e-01 3.106876e-01
treaty_model %>% nearest_to(treaty_model[[c("nation","citizen","property", "ration")]])
##     citizen    property    citizens      stolen       steal       crime 
##   0.1281904   0.1938332   0.2461862   0.3052806   0.3278432   0.3319556 
##     offense inhabitants      wrongs oppressions 
##   0.3573029   0.3764107   0.3768556   0.3845172
# Terms nearest to a query built from money, trade and annuity vocabulary.
nearest_to(
  treaty_model,
  treaty_model[[c(
    "trust", "income", "economy", "money",
    "currency", "trade", "annum", "annuity"
  )]]
)
##      semi     annum    income  annually      paid      cent     money 
## 0.2510550 0.2587554 0.2633063 0.2636353 0.2664192 0.2897399 0.2927063 
##   annuity  invested    amount 
## 0.2930000 0.3052392 0.3081476
# Terms nearest to the "governance" + "governed" query.
# NOTE(review): the printed distance for "governed" is ~0, so "governance"
# appears to contribute nothing to the query -- presumably it is not in the
# model's vocabulary; confirm.
nearest_to(treaty_model, treaty_model[[c("governance", "governed")]])
##      governed    conducting       auction advertisement     judicious 
##  2.220446e-16  2.677828e-01  3.547297e-01  3.660093e-01  3.765908e-01 
##        system        public       bidders    quantities        expect 
##  3.832375e-01  3.918471e-01  3.919071e-01  4.170087e-01  4.190062e-01
# Terms nearest to "border"; the word itself comes back first at ~0.
nearest_to(treaty_model, treaty_model[["border"]])
##        border     salamania        enters     salamanie mississinnewa 
##  5.551115e-16  2.567465e-01  2.597104e-01  2.726215e-01  2.789152e-01 
##      salamany  mississinewa     commences      opposite    reserve.to 
##  2.856063e-01  3.031653e-01  3.111223e-01  3.153699e-01  3.183517e-01
# Terms nearest to the combined "slave" + "slavery" query vector.
nearest_to(treaty_model, treaty_model[[c("slave", "slavery")]])
##     slavery   servitude involuntary     neither  henceforth       slave 
##   0.1642028   0.2417072   0.2437172   0.3703176   0.4149254   0.4181242 
##    offenses  prosecuted      fellow      unjust 
##   0.4756016   0.4913933   0.4936202   0.5001157
# Terms nearest to "war"; the word itself comes back first at ~0.
nearest_to(treaty_model, treaty_model[["war"]])
##          war      britain         late        great         club 
## 1.110223e-16 4.478813e-01 4.537758e-01 5.288326e-01 5.357989e-01 
##       resort      defense      defence         wars        stood 
## 5.419314e-01 5.460929e-01 5.668523e-01 5.799885e-01 5.822388e-01
# Terms nearest to a query built from "crime", "prison" and "law".
nearest_to(treaty_model, treaty_model[[c("crime", "prison", "law")]])
##      crime    capital   violence    offence     murder  committed 
##  0.1257522  0.1945093  0.2402509  0.2666520  0.2687466  0.2783448 
##    robbery conviction   happened     crimes 
##  0.2878158  0.2909968  0.3136323  0.3146393
# Terms nearest to "law" alone, for comparison with the combined query.
nearest_to(treaty_model, treaty_model[["law"]])
##          law       orders     enjoined    penalties     highways 
## 7.771561e-16 4.312067e-01 4.431849e-01 4.569171e-01 4.574289e-01 
##   ordinances       duties        dealt    reclaimed         pass 
## 4.704276e-01 4.800670e-01 4.917946e-01 5.008678e-01 5.019812e-01
# Terms nearest to the combined "citizen" + "citizenship" query vector.
nearest_to(treaty_model, treaty_model[[c("citizen", "citizenship")]])
##          citizen      citizenship         citizens            steal 
##        0.1360345        0.2584039        0.3559836        0.3795100 
##       unlawfully           become       prosecuted         indorsed 
##        0.4056544        0.4123928        0.4124730        0.4203644 
##          capital responsibilities 
##        0.4236191        0.4276640
# Terms nearest to "citizen" on its own.
nearest_to(treaty_model, treaty_model[["citizen"]])
##      citizen   inhabitant        crime      capital        steal 
## 2.220446e-16 3.178725e-01 3.313801e-01 3.591289e-01 3.712720e-01 
##     citizens       murder       stolen   unlawfully          any 
## 3.772024e-01 4.333286e-01 4.352204e-01 4.352773e-01 4.470571e-01
# Terms nearest to "citizenship" on its own.
nearest_to(treaty_model, treaty_model[["citizenship"]])
##      citizenship responsibilities         arranged             vote 
##     2.220446e-16     2.167933e-01     3.666116e-01     3.688635e-01 
##           become            unfit      industrious     distinctions 
##     3.992924e-01     4.185005e-01     4.211067e-01     4.259218e-01 
##        intending          prepare 
##     4.326889e-01     4.337953e-01
# Score every row of the model against two query vectors: one built from
# "right"/"rights", the other from "privilege"/"privileges".
# The variable names are kept as-is: plot() uses them as default axis labels.
rights <- treaty_model[rownames(treaty_model), ]
right_score <- cosineSimilarity(
  rights,
  treaty_model[[c("right", "rights")]]
)
privilege_score <- cosineSimilarity(
  rights,
  treaty_model[[c("privilege", "privileges")]]
)

# Set up an empty frame (type "n"), then write each word at its two scores.
plot(
  right_score,
  privilege_score,
  type = "n",
  main = "Words plotted by their similarity to rights \n(x axis) and privileges (y axis)."
)
text(right_score, privilege_score, labels = rownames(rights), cex = 0.7)
# Reference line y = x: words on it score the same against both queries.
abline(a = 0, b = 1)

# Score every row of the model against a "land"/"lands" query and a
# "water"/"waters" query, then plot each word at that pair of similarities.
# The variable names double as the default axis labels in plot().
treaties <- treaty_model[rownames(treaty_model), ]
lands <- cosineSimilarity(treaties, treaty_model[[c("land", "lands")]])
water <- cosineSimilarity(treaties, treaty_model[[c("water", "waters")]])

# Every row of the model is plotted (there is no top-N subset), so the title
# does not claim "Top words"; its wording mirrors the rights/privileges plot.
plot(
  lands,
  water,
  type = "n",
  main = "Words plotted by their similarity to land \n(x axis) and water (y axis)."
)
text(lands, water, labels = rownames(treaties), cex = 0.7)
# Reference line y = x: words on it are equally similar to both queries.
abline(a = 0, b = 1)