Trump Tweets Text Analysis
Analyzing @realDonaldTrump tweets posted since January: term frequencies, word associations, topic modeling (LDA), and sentiment over time.
library(twitteR)
library(ROAuth)
# Twitter setup (credentials redacted; replace the placeholders with your own API keys)
setup_twitter_oauth(consumer_key = "CONSUMER_KEY",
                    consumer_secret = "CONSUMER_SECRET",
                    access_token = "ACCESS_TOKEN",
                    access_secret = "ACCESS_SECRET")
## [1] "Using direct authentication"
# Grab tweets and convert them to a data frame
tweets <- userTimeline("realdonaldtrump", n = 3200)
tweets.df <- twListToDF(tweets)
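# (optional sketch) quick sanity check on what came back; "text" and "created"
# are the two columns used in the rest of the analysis
dim(tweets.df)
head(tweets.df$text, 3)
range(tweets.df$created)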
# Text cleaning
library(tm)
## Loading required package: NLP
corpus <- Corpus(VectorSource(tweets.df$text))
#corpus <- tm_map(corpus, content_transformer(tolower))
# strip URLs
removeurl <- function(x) gsub("http[^[:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeurl))
# keep only letters and spaces (drops numbers, punctuation and emoji)
removenumpunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removenumpunct))
# custom stopwords: English stopwords (keeping "r") plus "the" and "amp";
# "amp" is left over from "&amp;" in the raw tweet text
mystopwords <- c(setdiff(stopwords('english'), c("r")),
                 "the", "amp")
corpus <- tm_map(corpus, removeWords, mystopwords)
corpus <- tm_map(corpus, stripWhitespace)
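# (optional sketch) compare a few raw tweets with their cleaned versions
for (i in 1:3) {
  cat("raw:     ", tweets.df$text[i], "\n")
  cat("cleaned: ", as.character(corpus[[i]]), "\n\n")
}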
# Term document matrix
tdm <- TermDocumentMatrix(corpus,
control = list(wordLengths = c(1, Inf)))
tdm
## <<TermDocumentMatrix (terms: 2062, documents: 470)>>
## Non-/sparse entries: 5679/963461
## Sparsity : 99%
## Maximal term length: 26
## Weighting : term frequency (tf)
idx <- which(dimnames(tdm)$Terms %in% c("obama", "hillary"))
as.matrix(tdm[idx, 1:20])
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## hillary 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## obama 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Top frequent terms
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
freq.terms
## [1] "big" "great" "will" "people" "amp"
## [6] "the" "us" "american" "thank" "democrats"
## [11] "i" "obamacare" "healthcare" "fake" "news"
## [16] "many" "america" "russia" "time" "just"
## [21] "new" "president" "election" "it" "media"
## [26] "today" "get" "we"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 20)
df <- data.frame(term = names(term.freq), freq = term.freq)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
ggplot(df, aes(x = term, y = freq)) +
geom_bar(stat = "identity") +
xlab("Terms") + ylab("Count") + coord_flip() +
theme(axis.text = element_text(size = 7))
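# (optional sketch) the same chart with terms ordered by frequency, which makes
# the ranking easier to read
ggplot(df, aes(x = reorder(term, freq), y = freq)) +
geom_bar(stat = "identity") +
xlab("Terms") + ylab("Count") + coord_flip() +
theme(axis.text = element_text(size = 7))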
findAssocs(tdm, "obama", 0.2)
## $obama
## administration meddling crimea soft russia
## 0.57 0.50 0.50 0.50 0.41
## took taken was place nothing
## 0.37 0.35 0.35 0.32 0.30
## why election with agencies bombshell
## 0.29 0.27 0.25 0.25 0.25
## check dismissed from rig informed
## 0.25 0.25 0.25 0.25 0.25
## notified choked told advance november
## 0.25 0.25 0.25 0.25 0.25
## by appointed counsel eyes sleepy
## 0.25 0.25 0.25 0.25 0.25
## todd trumprussia angry number voted
## 0.25 0.25 0.25 0.25 0.25
## didnt
## 0.24
findAssocs(tdm, "hillary", 0.2)
## $hillary
## clinton emails acting deleted colluded clintons
## 0.49 0.41 0.39 0.39 0.39 0.39
## crooked crimes beat did andrew charge
## 0.36 0.31 0.31 0.31 0.27 0.27
## head mccabe problem wife attorney sessions
## 0.27 0.27 0.27 0.27 0.27 0.27
## didnt acid authorities washed whereas council
## 0.27 0.27 0.27 0.27 0.27 0.27
## isnt boat choke obstructed bernie sanders
## 0.27 0.27 0.27 0.27 0.27 0.27
## unfair choked blames candidate facebook refuses
## 0.27 0.27 0.27 0.27 0.27 0.27
## considering answers receiving brother lifted paid
## 0.27 0.27 0.27 0.27 0.27 0.27
## connection coverup merely mistakes based dnc
## 0.27 0.27 0.27 0.27 0.27 0.24
## asking
## 0.24
findAssocs(tdm, c("republican","republicans"), 0.2)
## $republican
## senators concerning pass must right solely
## 0.75 0.38 0.29 0.28 0.26 0.25
## anthony endorse fairness primaries scaramucci lunch
## 0.25 0.25 0.25 0.25 0.25 0.25
## suffering thru immediately unable unlike expensive
## 0.25 0.25 0.25 0.25 0.25 0.25
## legacy burn easy not suffer certainly
## 0.25 0.25 0.25 0.25 0.25 0.25
## force nancy p please add quickly
## 0.25 0.25 0.25 0.25 0.25 0.25
## alike calm thanking replace hard working
## 0.25 0.25 0.25 0.24 0.23 0.22
## healthcare
## 0.21
##
## $republicans
## loyal return terrific most worked
## 0.45 0.45 0.45 0.31 0.31
## imploding repeal dems chance work
## 0.31 0.27 0.25 0.25 0.25
## together healthcarethe obamacare willing campaigning
## 0.24 0.22 0.22 0.22 0.22
## greater repercussions understand lunchtimethe scream
## 0.22 0.22 0.22 0.22 0.22
## small victories slate replacement civilian
## 0.22 0.22 0.22 0.22 0.22
## listened pushed convince won cutssecurity
## 0.22 0.22 0.22 0.22 0.22
## healthcaretax doddfrank financial hensarling jeb
## 0.22 0.22 0.22 0.22 0.22
## successful however subsidies yet progress
## 0.22 0.22 0.22 0.22 0.22
## replace
## 0.20
findAssocs(tdm, "healthcare", 0.2)
## $healthcare
## plan death premiums spiral tumbling approved
## 0.49 0.39 0.39 0.37 0.37 0.36
## cuts obamacare bill concerning fail senators
## 0.29 0.27 0.27 0.27 0.25 0.22
## tax republican ocare leadership
## 0.22 0.21 0.21 0.20
findAssocs(tdm, "obamacare", 0.2)
## $obamacare
## come dead repeal ideas
## 0.37 0.34 0.32 0.32
## plan replace healthcare disaster
## 0.32 0.29 0.27 0.27
## pen disastrous fail premiums
## 0.26 0.26 0.26 0.26
## imploding obstructionists save republicans
## 0.26 0.24 0.24 0.22
## democrats senators hand americans
## 0.22 0.22 0.21 0.21
## failed insurance dems
## 0.21 0.21 0.20
findAssocs(tdm, "russia", 0.2)
## $russia
## was crimea soft obama taken
## 0.45 0.43 0.43 0.41 0.30
## meddling podesta did adam biased
## 0.25 0.24 0.24 0.21 0.21
## schiff sleazy spends television arent
## 0.21 0.21 0.21 0.21 0.21
## beleaguered committees course hillarys investigators
## 0.21 0.21 0.21 0.21 0.21
## relations blame evidence ourselvesnot dollar
## 0.21 0.21 0.21 0.21 0.21
## reset ties uranium constructively negotiated
## 0.21 0.21 0.21 0.21 0.21
## employees resign retract informed notified
## 0.21 0.21 0.21 0.21 0.21
## advance november by advisor grand
## 0.21 0.21 0.21 0.21 0.21
## homeland jeh johnson airline flight
## 0.21 0.21 0.21 0.21 0.21
## humanitarian plus reasons absolute pertaining
## 0.21 0.21 0.21 0.21 0.21
## scheduled share sleeves brother lifted
## 0.21 0.21 0.21 0.21 0.21
## paid associated nominated path fbijust
## 0.21 0.21 0.21 0.21 0.21
# Topic Modeling
dtm <- as.DocumentTermMatrix(tdm)
library(topicmodels)
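# (optional sketch) LDA() stops with an error if any document has zero terms,
# which can happen after aggressive cleaning; if so, drop the empty rows and
# keep the matching subset of tweets so the date-based plots below stay aligned
row.totals <- apply(dtm, 1, sum)
if (any(row.totals == 0)) {
  dtm <- dtm[row.totals > 0, ]
  tweets.df <- tweets.df[row.totals > 0, ]
}
# for reproducible topics, a seed can be passed: LDA(dtm, k = 8, control = list(seed = 123))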
lda <- LDA(dtm, k = 8)
term <- terms(lda, 7)
term <- apply(term, MARGIN = 2, paste, collapse = ", ")
term
## Topic 1
## "president, i, the, amp, great, honor, whitehouse"
## Topic 2
## "amp, i, great, get, media, the, people"
## Topic 3
## "the, we, i, will, obamacare, fake, election"
## Topic 4
## "will, great, i, republicans, good, people, a"
## Topic 5
## "will, news, amp, america, american, great, thank"
## Topic 6
## "us, the, amp, i, will, russia, great"
## Topic 7
## "will, great, get, today, the, fake, many"
## Topic 8
## "i, will, amp, big, the, fake, new"
topics <- topics(lda)
topics <- data.frame(date = as.Date(tweets.df$created), topic = topics)
ggplot(topics, aes(date, fill = term[topic])) +
geom_density(position = "stack")
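# (optional sketch) an alternative view of the same assignments: daily tweet
# counts faceted by topic (labels shortened to the topic number)
topics$label <- paste("Topic", topics$topic)
ggplot(topics, aes(x = date)) +
geom_bar() +
facet_wrap(~ label, ncol = 2) +
xlab("Date") + ylab("Tweets")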
# Sentiment analysis
# install the sentiment140 package from GitHub (only needed once)
library(devtools)
install_github("okugami79/sentiment140")
## Skipping install of 'sentiment' from a github remote, the SHA1 (75be56d6) has not changed since last install.
## Use `force = TRUE` to force installation
library(sentiment)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
##
## id
sentiments <- sentiment(tweets.df$text)
table(sentiments$polarity)
##
## negative neutral positive
## 55 315 100
sentiments$score <- 0
sentiments$score[sentiments$polarity == "positive"] <- 1
sentiments$score[sentiments$polarity == "negative"] <- -1
sentiments$date <- as.Date(tweets.df$created)
results <- aggregate(score ~ date, data = sentiments, sum)
ggplot(results, aes(x = date, y = score)) +
geom_line()
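# (optional sketch) daily mean score rather than the daily sum, which controls
# for days with many tweets
result.mean <- aggregate(score ~ date, data = sentiments, mean)
ggplot(result.mean, aes(x = date, y = score)) +
geom_line()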