Word Vectors for Treaties
Experimenting with word vectors on Native American treaties data.
suppressPackageStartupMessages(library(text2vec))
## Warning: package 'text2vec' was built under R version 3.4.3
suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.3
suppressPackageStartupMessages(library(wordVectors))
suppressPackageStartupMessages(library(tsne))
# One-time corpus preparation and model training. Both steps are left commented
# out (presumably so that re-running this document does not retrain); only the
# vectors file named as `output` in the training call is loaded below.
# NOTE(review): the prep call is given "treaty_all.txt" while the training call
# reads "data/treaty_all.txt" -- confirm the two paths agree before re-enabling.
#prep_word2vec("data/", "treaty_all.txt", lowercase = TRUE)
#treaty_model <- train_word2vec("data/treaty_all.txt", output = "data/treaty_all_vectors.bin", threads = 6, vectors = 100, window = 12)
# Load the previously trained vectors from the binary file.
treaty_model <- read.vectors("data/treaty_all_vectors.bin")
## Filename ends with .bin, so reading in binary format
## Reading a word2vec binary file of 8080 rows and 100 columns
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|==== | 7%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|====== | 10%
|
|======= | 10%
|
|======= | 11%
|
|======= | 12%
|
|======== | 12%
|
|======== | 13%
|
|========= | 13%
|
|========= | 14%
|
|========= | 15%
|
|========== | 15%
|
|========== | 16%
|
|=========== | 16%
|
|=========== | 17%
|
|=========== | 18%
|
|============ | 18%
|
|============ | 19%
|
|============= | 19%
|
|============= | 20%
|
|============= | 21%
|
|============== | 21%
|
|============== | 22%
|
|=============== | 22%
|
|=============== | 23%
|
|=============== | 24%
|
|================ | 24%
|
|================ | 25%
|
|================= | 25%
|
|================= | 26%
|
|================= | 27%
|
|================== | 27%
|
|================== | 28%
|
|=================== | 28%
|
|=================== | 29%
|
|=================== | 30%
|
|==================== | 30%
|
|==================== | 31%
|
|==================== | 32%
|
|===================== | 32%
|
|===================== | 33%
|
|====================== | 33%
|
|====================== | 34%
|
|====================== | 35%
|
|======================= | 35%
|
|======================= | 36%
|
|======================== | 36%
|
|======================== | 37%
|
|======================== | 38%
|
|========================= | 38%
|
|========================= | 39%
|
|========================== | 39%
|
|========================== | 40%
|
|========================== | 41%
|
|=========================== | 41%
|
|=========================== | 42%
|
|============================ | 42%
|
|============================ | 43%
|
|============================ | 44%
|
|============================= | 44%
|
|============================= | 45%
|
|============================== | 45%
|
|============================== | 46%
|
|============================== | 47%
|
|=============================== | 47%
|
|=============================== | 48%
|
|================================ | 48%
|
|================================ | 49%
|
|================================ | 50%
|
|================================= | 50%
|
|================================= | 51%
|
|================================= | 52%
|
|================================== | 52%
|
|================================== | 53%
|
|=================================== | 53%
|
|=================================== | 54%
|
|=================================== | 55%
|
|==================================== | 55%
|
|==================================== | 56%
|
|===================================== | 56%
|
|===================================== | 57%
|
|===================================== | 58%
|
|====================================== | 58%
|
|====================================== | 59%
|
|======================================= | 59%
|
|======================================= | 60%
|
|======================================= | 61%
|
|======================================== | 61%
|
|======================================== | 62%
|
|========================================= | 62%
|
|========================================= | 63%
|
|========================================= | 64%
|
|========================================== | 64%
|
|========================================== | 65%
|
|=========================================== | 65%
|
|=========================================== | 66%
|
|=========================================== | 67%
|
|============================================ | 67%
|
|============================================ | 68%
|
|============================================= | 68%
|
|============================================= | 69%
|
|============================================= | 70%
|
|============================================== | 70%
|
|============================================== | 71%
|
|============================================== | 72%
|
|=============================================== | 72%
|
|=============================================== | 73%
|
|================================================ | 73%
|
|================================================ | 74%
|
|================================================ | 75%
|
|================================================= | 75%
|
|================================================= | 76%
|
|================================================== | 76%
|
|================================================== | 77%
|
|================================================== | 78%
|
|=================================================== | 78%
|
|=================================================== | 79%
|
|==================================================== | 79%
|
|==================================================== | 80%
|
|==================================================== | 81%
|
|===================================================== | 81%
|
|===================================================== | 82%
|
|====================================================== | 82%
|
|====================================================== | 83%
|
|====================================================== | 84%
|
|======================================================= | 84%
|
|======================================================= | 85%
|
|======================================================== | 85%
|
|======================================================== | 86%
|
|======================================================== | 87%
|
|========================================================= | 87%
|
|========================================================= | 88%
|
|========================================================== | 88%
|
|========================================================== | 89%
|
|========================================================== | 90%
|
|=========================================================== | 90%
|
|=========================================================== | 91%
|
|=========================================================== | 92%
|
|============================================================ | 92%
|
|============================================================ | 93%
|
|============================================================= | 93%
|
|============================================================= | 94%
|
|============================================================= | 95%
|
|============================================================== | 95%
|
|============================================================== | 96%
|
|=============================================================== | 96%
|
|=============================================================== | 97%
|
|=============================================================== | 98%
|
|================================================================ | 98%
|
|================================================================ | 99%
|
|=================================================================| 99%
|
|=================================================================| 100%
And let's look at a few things.
# Nearest-neighbour queries against the loaded model. Each call prints ten
# vocabulary words with a score. A word queried on its own comes back first
# with a score of ~0 (e.g. 5.55e-16 for "sioux"), so smaller scores mean closer.
# Lines starting with "##" are the recorded output of the statement above them.
treaty_model %>% nearest_to(treaty_model[["sioux"]])
## sioux dakota brulé yanktonai arcs
## 5.551115e-16 2.600296e-01 2.923641e-01 3.072613e-01 3.526654e-01
## onkpahpah represented wahpakoota o'gallala yanktonais
## 3.639902e-01 3.830032e-01 3.850588e-01 3.903711e-01 3.926642e-01
treaty_model %>% nearest_to(treaty_model[["land"]])
## land tract contiguous adjoining acres
## 4.440892e-16 2.114601e-01 3.266464e-01 3.290219e-01 3.371009e-01
## sections reserved containing tracts contain
## 3.729826e-01 3.748342e-01 3.750511e-01 3.800679e-01 3.811347e-01
treaty_model %>% nearest_to(treaty_model[["women"]])
## women kill capture scalp intermarried
## 2.220446e-16 4.120251e-01 4.306975e-01 4.316945e-01 4.437425e-01
## females children coaches ages female
## 4.552839e-01 4.709732e-01 4.903550e-01 4.944802e-01 4.956098e-01
treaty_model %>% nearest_to(treaty_model[["privilege"]])
## privilege gathering roots hunting pasturing
## 3.330669e-16 2.737482e-01 3.131073e-01 3.181663e-01 3.594188e-01
## curing berries enjoy unclaimed grazing
## 3.646737e-01 3.731652e-01 3.975566e-01 4.358794e-01 4.417746e-01
# Queries built from several words: `[[` receives a character vector and the
# call still yields a single ranking (presumably the matching rows are averaged
# into one query vector -- confirm in the wordVectors documentation).
treaty_model %>% nearest_to(treaty_model[[c("land","water")]])
## water land tract beard's front parcel
## 0.1467987 0.3181458 0.3284152 0.3795273 0.4030645 0.4111357
## parcels lying begining contiguous
## 0.4130324 0.4161033 0.4219893 0.4316613
treaty_model %>% nearest_to(treaty_model[[c("sovereign","sovereignty")]])
## sovereign sovereignty whatsoever whosoever power iaway
## 0.08647106 0.24945685 0.34318705 0.40012940 0.45668615 0.53273085
## whatever piankishaw release covenants
## 0.53772180 0.54511466 0.55134641 0.55368069
treaty_model %>% nearest_to(treaty_model[[c("survey","remove")]])
## survey remove removed removal within surveyed
## 0.1718791 0.2440431 0.3752076 0.4286823 0.4327824 0.4359293
## selection reservation homes expense
## 0.4459670 0.4564428 0.4565563 0.4612289
treaty_model %>% nearest_to(treaty_model[[c("never","until","unless")]])
## unless never until otherwise nor exchanged but
## 0.2508166 0.2509114 0.2756127 0.3032240 0.3572091 0.3970306 0.4010554
## leased not disposed
## 0.4017218 0.4084792 0.4156061
treaty_model %>% nearest_to(treaty_model[[c("sovereign","survey","remove","privilege","annum","liquor")]])
## liquor paying bring restraint intrude excess
## 0.4130184 0.4328057 0.4404356 0.4514574 0.4537095 0.4561355
## any procedure diminished lien
## 0.4573935 0.4596945 0.4644880 0.4717117
treaty_model %>% nearest_to(treaty_model[[c("rights","citizen","citizens","independent","sovereign","sovereignty")]])
## citizens whosoever sovereign subjects whatsoever citizen
## 0.2185664 0.3021226 0.3092277 0.3426967 0.3613456 0.3613839
## all intentions property whatever
## 0.3780966 0.3795283 0.3900762 0.3991890
treaty_model %>% nearest_to(treaty_model[[c("savage","depredation","depredations","civilized","hostile","friendly","enlightened")]])
## depredations friendly aggression depredation commit
## 0.1706874 0.2559353 0.2751960 0.2868044 0.2988888
## community violations intentions refer resort
## 0.2992553 0.3036988 0.3124136 0.3254526 0.3343209
# NOTE(review): "chritianity" looks like a misspelling of "christianity". No
# error was recorded, so the misspelled term presumably matches nothing and is
# ignored. Correcting it requires re-running to refresh the output below.
treaty_model %>% nearest_to(treaty_model[[c("religion","chritianity","missionaries","mission")]])
## mission religion catholic enclosures school
## 0.2205716 0.3022282 0.3034389 0.3053957 0.3285781
## missionaries graveyard solely society churches
## 0.3364435 0.3476846 0.3540666 0.3735994 0.3815150
treaty_model %>% nearest_to(treaty_model[[c("fishing","hunting")]])
## fishing hunting grounds gathering roots berries
## 0.09345720 0.09849956 0.22297792 0.22742867 0.26133680 0.29470077
## pasturing privilege temporary curing
## 0.30648832 0.31809462 0.32094083 0.34211069
treaty_model %>% nearest_to(treaty_model[[c("right","rights","privilege","privileges")]])
## rights privileges privilege right immunities enjoy
## 0.1380672 0.1385519 0.2476362 0.2645462 0.2727490 0.2898248
## secured acquired conceded all
## 0.3605596 0.3973752 0.4103890 0.4111070
treaty_model %>% nearest_to(treaty_model[[c("wooded","woods","grass","hills")]])
## hills woods grass gap pigeon cowee salmon
## 0.1888916 0.2129462 0.2185929 0.2691507 0.3083977 0.3173047 0.3217354
## sweet occunna trail
## 0.3264340 0.3288510 0.3393208
treaty_model %>% nearest_to(treaty_model[[c("education","school")]])
## education school schools taught support boarding manual
## 0.1100669 0.1163392 0.1793846 0.2619291 0.2725217 0.2875050 0.2914613
## managed labor pupils
## 0.3367332 0.3372301 0.3397492
treaty_model %>% nearest_to(treaty_model[[c("children","family")]])
## family children age each orphan upwards single
## 0.1634594 0.2001839 0.2417852 0.2830500 0.3193517 0.3240626 0.3266955
## child female male
## 0.3420196 0.3527515 0.3594355
treaty_model %>% nearest_to(treaty_model[["civilization"]])
## civilization improvement advancement promote welfare
## 2.220446e-16 2.166492e-01 2.304636e-01 2.642834e-01 2.651223e-01
## calculated advance tend improve prosperity
## 2.898264e-01 2.966781e-01 2.979192e-01 3.008211e-01 3.106876e-01
treaty_model %>% nearest_to(treaty_model[[c("nation","citizen","property", "ration")]])
## citizen property citizens stolen steal crime
## 0.1281904 0.1938332 0.2461862 0.3052806 0.3278432 0.3319556
## offense inhabitants wrongs oppressions
## 0.3573029 0.3764107 0.3768556 0.3845172
treaty_model %>% nearest_to(treaty_model[[c("trust","income","economy","money","currency","trade","annum","annuity")]])
## semi annum income annually paid cent money
## 0.2510550 0.2587554 0.2633063 0.2636353 0.2664192 0.2897399 0.2927063
## annuity invested amount
## 0.2930000 0.3052392 0.3081476
# NOTE(review): "governed" comes back with a score of ~0, the same as a
# single-word query. That suggests "governance" is not in the vocabulary and
# contributed nothing to this query -- TODO confirm.
treaty_model %>% nearest_to(treaty_model[[c("governance","governed")]])
## governed conducting auction advertisement judicious
## 2.220446e-16 2.677828e-01 3.547297e-01 3.660093e-01 3.765908e-01
## system public bidders quantities expect
## 3.832375e-01 3.918471e-01 3.919071e-01 4.170087e-01 4.190062e-01
treaty_model %>% nearest_to(treaty_model[["border"]])
## border salamania enters salamanie mississinnewa
## 5.551115e-16 2.567465e-01 2.597104e-01 2.726215e-01 2.789152e-01
## salamany mississinewa commences opposite reserve.to
## 2.856063e-01 3.031653e-01 3.111223e-01 3.153699e-01 3.183517e-01
treaty_model %>% nearest_to(treaty_model[[c("slave","slavery")]])
## slavery servitude involuntary neither henceforth slave
## 0.1642028 0.2417072 0.2437172 0.3703176 0.4149254 0.4181242
## offenses prosecuted fellow unjust
## 0.4756016 0.4913933 0.4936202 0.5001157
treaty_model %>% nearest_to(treaty_model[["war"]])
## war britain late great club
## 1.110223e-16 4.478813e-01 4.537758e-01 5.288326e-01 5.357989e-01
## resort defense defence wars stood
## 5.419314e-01 5.460929e-01 5.668523e-01 5.799885e-01 5.822388e-01
treaty_model %>% nearest_to(treaty_model[[c("crime","prison","law")]])
## crime capital violence offence murder committed
## 0.1257522 0.1945093 0.2402509 0.2666520 0.2687466 0.2783448
## robbery conviction happened crimes
## 0.2878158 0.2909968 0.3136323 0.3146393
# The remaining single-word queries wrap the word in c(); with one string this
# is the same value as the bare string, so c() is redundant but harmless.
treaty_model %>% nearest_to(treaty_model[[c("law")]])
## law orders enjoined penalties highways
## 7.771561e-16 4.312067e-01 4.431849e-01 4.569171e-01 4.574289e-01
## ordinances duties dealt reclaimed pass
## 4.704276e-01 4.800670e-01 4.917946e-01 5.008678e-01 5.019812e-01
# Combined "citizen"/"citizenship" query, followed by each word on its own for
# comparison.
treaty_model %>% nearest_to(treaty_model[[c("citizen","citizenship")]])
## citizen citizenship citizens steal
## 0.1360345 0.2584039 0.3559836 0.3795100
## unlawfully become prosecuted indorsed
## 0.4056544 0.4123928 0.4124730 0.4203644
## capital responsibilities
## 0.4236191 0.4276640
treaty_model %>% nearest_to(treaty_model[[c("citizen")]])
## citizen inhabitant crime capital steal
## 2.220446e-16 3.178725e-01 3.313801e-01 3.591289e-01 3.712720e-01
## citizens murder stolen unlawfully any
## 3.772024e-01 4.333286e-01 4.352204e-01 4.352773e-01 4.470571e-01
treaty_model %>% nearest_to(treaty_model[[c("citizenship")]])
## citizenship responsibilities arranged vote
## 2.220446e-16 2.167933e-01 3.666116e-01 3.688635e-01
## become unfit industrious distinctions
## 3.992924e-01 4.185005e-01 4.211067e-01 4.259218e-01
## intending prepare
## 4.326889e-01 4.337953e-01
# Plot 1: place every word in the model by its cosine similarity to the
# combined "right"/"rights" query (x) and the "privilege"/"privileges"
# query (y).
rights <- treaty_model[rownames(treaty_model), ]
right_score <- cosineSimilarity(rights, treaty_model[[c("right", "rights")]])
privilege_score <- cosineSimilarity(
  rights,
  treaty_model[[c("privilege", "privileges")]]
)
# type = "n" draws empty axes only; the words themselves are added by text().
plot(
  right_score,
  privilege_score,
  main = "Words plotted by their similarity to rights \n(x axis) and privileges (y axis).",
  type = "n"
)
text(right_score, privilege_score, labels = rownames(rights), cex = 0.7)
# Reference line y = x: words equally similar to both queries fall on it.
abline(a = 0, b = 1)

# Plot 2: the same layout for "land"/"lands" (x) against "water"/"waters" (y).
# NOTE(review): the title says "Top words", but every row of the model is
# selected and plotted here -- confirm whether a subset was intended.
treaties <- treaty_model[rownames(treaty_model), ]
lands <- cosineSimilarity(treaties, treaty_model[[c("land", "lands")]])
water <- cosineSimilarity(treaties, treaty_model[[c("water", "waters")]])
plot(
  lands,
  water,
  main = "Top words plotted by their similarity to land \n and water",
  type = "n"
)
text(lands, water, labels = rownames(treaties), cex = 0.7)
abline(a = 0, b = 1)