Trump Tweets Text Analysis
Analyzing @realDonaldTrump tweets posted since January: term frequencies, word associations, topic modeling (LDA), and sentiment over time.
library(twitteR)
library(ROAuth)
# Twitter setup (credentials redacted; replace the placeholders with your own API keys)
setup_twitter_oauth(consumer_key = "CONSUMER_KEY",
                    consumer_secret = "CONSUMER_SECRET",
                    access_token = "ACCESS_TOKEN",
                    access_secret = "ACCESS_SECRET")
## [1] "Using direct authentication"
# Grab tweets and convert them to a data frame
tweets <- userTimeline("realdonaldtrump", n = 3200)
tweets.df <- twListToDF(tweets)
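# (optional sketch) quick sanity check on what came back; "text" and "created"
# are the two columns used in the rest of the analysis
dim(tweets.df)
head(tweets.df$text, 3)
range(tweets.df$created)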
# Text cleaning
library(tm)
## Loading required package: NLP
corpus <- Corpus(VectorSource(tweets.df$text))
#corpus <- tm_map(corpus, content_transformer(tolower))
# strip URLs
removeurl <- function(x) gsub("http[^[:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removeurl))
# keep only letters and spaces (drops numbers, punctuation and emoji)
removenumpunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
corpus <- tm_map(corpus, content_transformer(removenumpunct))
# custom stopwords: English stopwords (keeping "r") plus "the" and "amp";
# "amp" is left over from "&amp;" in the raw tweet text
mystopwords <- c(setdiff(stopwords('english'), c("r")),
                 "the", "amp")
corpus <- tm_map(corpus, removeWords, mystopwords)
corpus <- tm_map(corpus, stripWhitespace)
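# (optional sketch) compare a few raw tweets with their cleaned versions
for (i in 1:3) {
  cat("raw:     ", tweets.df$text[i], "\n")
  cat("cleaned: ", as.character(corpus[[i]]), "\n\n")
}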
# Term document matrix
tdm <- TermDocumentMatrix(corpus,
control = list(wordLengths = c(1, Inf)))
tdm
## <<TermDocumentMatrix (terms: 2062, documents: 470)>>
## Non-/sparse entries: 5679/963461
## Sparsity : 99%
## Maximal term length: 26
## Weighting : term frequency (tf)
idx <- which(dimnames(tdm)$Terms %in% c("obama", "hillary"))
as.matrix(tdm[idx, 1:20])
## Docs
## Terms 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## hillary 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
## obama 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Top frequent terms
freq.terms <- findFreqTerms(tdm, lowfreq = 20)
freq.terms
## [1] "big" "great" "will" "people" "amp"
## [6] "the" "us" "american" "thank" "democrats"
## [11] "i" "obamacare" "healthcare" "fake" "news"
## [16] "many" "america" "russia" "time" "just"
## [21] "new" "president" "election" "it" "media"
## [26] "today" "get" "we"
term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 20)
df <- data.frame(term = names(term.freq), freq = term.freq)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
ggplot(df, aes(x = term, y = freq)) +
geom_bar(stat = "identity") +
xlab("Terms") + ylab("Count") + coord_flip() +
theme(axis.text = element_text(size = 7))
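# (optional sketch) the same chart with terms ordered by frequency, which makes
# the ranking easier to read
ggplot(df, aes(x = reorder(term, freq), y = freq)) +
geom_bar(stat = "identity") +
xlab("Terms") + ylab("Count") + coord_flip() +
theme(axis.text = element_text(size = 7))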
findAssocs(tdm, "obama", 0.2)
## $obama
## administration meddling crimea soft russia
## 0.57 0.50 0.50 0.50 0.41
## took taken was place nothing
## 0.37 0.35 0.35 0.32 0.30
## why election with agencies bombshell
## 0.29 0.27 0.25 0.25 0.25
## check dismissed from rig informed
## 0.25 0.25 0.25 0.25 0.25
## notified choked told advance november
## 0.25 0.25 0.25 0.25 0.25
## by appointed counsel eyes sleepy
## 0.25 0.25 0.25 0.25 0.25
## todd trumprussia angry number voted
## 0.25 0.25 0.25 0.25 0.25
## didnt
## 0.24
findAssocs(tdm, "hillary", 0.2)
## $hillary
## clinton emails acting deleted colluded clintons
## 0.49 0.41 0.39 0.39 0.39 0.39
## crooked crimes beat did andrew charge
## 0.36 0.31 0.31 0.31 0.27 0.27
## head mccabe problem wife attorney sessions
## 0.27 0.27 0.27 0.27 0.27 0.27
## didnt acid authorities washed whereas council
## 0.27 0.27 0.27 0.27 0.27 0.27
## isnt boat choke obstructed bernie sanders
## 0.27 0.27 0.27 0.27 0.27 0.27
## unfair choked blames candidate facebook refuses
## 0.27 0.27 0.27 0.27 0.27 0.27
## considering answers receiving brother lifted paid
## 0.27 0.27 0.27 0.27 0.27 0.27
## connection coverup merely mistakes based dnc
## 0.27 0.27 0.27 0.27 0.27 0.24
## asking
## 0.24
findAssocs(tdm, c("republican","republicans"), 0.2)
## $republican
## senators concerning pass must right solely
## 0.75 0.38 0.29 0.28 0.26 0.25
## anthony endorse fairness primaries scaramucci lunch
## 0.25 0.25 0.25 0.25 0.25 0.25
## suffering thru immediately unable unlike expensive
## 0.25 0.25 0.25 0.25 0.25 0.25
## legacy burn easy not suffer certainly
## 0.25 0.25 0.25 0.25 0.25 0.25
## force nancy p please add quickly
## 0.25 0.25 0.25 0.25 0.25 0.25
## alike calm thanking replace hard working
## 0.25 0.25 0.25 0.24 0.23 0.22
## healthcare
## 0.21
##
## $republicans
## loyal return terrific most worked
## 0.45 0.45 0.45 0.31 0.31
## imploding repeal dems chance work
## 0.31 0.27 0.25 0.25 0.25
## together healthcarethe obamacare willing campaigning
## 0.24 0.22 0.22 0.22 0.22
## greater repercussions understand lunchtimethe scream
## 0.22 0.22 0.22 0.22 0.22
## small victories slate replacement civilian
## 0.22 0.22 0.22 0.22 0.22
## listened pushed convince won cutssecurity
## 0.22 0.22 0.22 0.22 0.22
## healthcaretax doddfrank financial hensarling jeb
## 0.22 0.22 0.22 0.22 0.22
## successful however subsidies yet progress
## 0.22 0.22 0.22 0.22 0.22
## replace
## 0.20
findAssocs(tdm, "healthcare", 0.2)
## $healthcare
## plan death premiums spiral tumbling approved
## 0.49 0.39 0.39 0.37 0.37 0.36
## cuts obamacare bill concerning fail senators
## 0.29 0.27 0.27 0.27 0.25 0.22
## tax republican ocare leadership
## 0.22 0.21 0.21 0.20
findAssocs(tdm, "obamacare", 0.2)
## $obamacare
## come dead repeal ideas
## 0.37 0.34 0.32 0.32
## plan replace healthcare disaster
## 0.32 0.29 0.27 0.27
## pen disastrous fail premiums
## 0.26 0.26 0.26 0.26
## imploding obstructionists save republicans
## 0.26 0.24 0.24 0.22
## democrats senators hand americans
## 0.22 0.22 0.21 0.21
## failed insurance dems
## 0.21 0.21 0.20
findAssocs(tdm, "russia", 0.2)
## $russia
## was crimea soft obama taken
## 0.45 0.43 0.43 0.41 0.30
## meddling podesta did adam biased
## 0.25 0.24 0.24 0.21 0.21
## schiff sleazy spends television arent
## 0.21 0.21 0.21 0.21 0.21
## beleaguered committees course hillarys investigators
## 0.21 0.21 0.21 0.21 0.21
## relations blame evidence ourselvesnot dollar
## 0.21 0.21 0.21 0.21 0.21
## reset ties uranium constructively negotiated
## 0.21 0.21 0.21 0.21 0.21
## employees resign retract informed notified
## 0.21 0.21 0.21 0.21 0.21
## advance november by advisor grand
## 0.21 0.21 0.21 0.21 0.21
## homeland jeh johnson airline flight
## 0.21 0.21 0.21 0.21 0.21
## humanitarian plus reasons absolute pertaining
## 0.21 0.21 0.21 0.21 0.21
## scheduled share sleeves brother lifted
## 0.21 0.21 0.21 0.21 0.21
## paid associated nominated path fbijust
## 0.21 0.21 0.21 0.21 0.21
# Topic Modeling
dtm <- as.DocumentTermMatrix(tdm)
library(topicmodels)
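# (optional sketch) LDA() stops with an error if any document has zero terms,
# which can happen after aggressive cleaning; if so, drop the empty rows and
# keep the matching subset of tweets so the date-based plots below stay aligned
row.totals <- apply(dtm, 1, sum)
if (any(row.totals == 0)) {
  dtm <- dtm[row.totals > 0, ]
  tweets.df <- tweets.df[row.totals > 0, ]
}
# for reproducible topics, a seed can be passed: LDA(dtm, k = 8, control = list(seed = 123))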
lda <- LDA(dtm, k = 8)
term <- terms(lda, 7)
term <- apply(term, MARGIN = 2, paste, collapse = ", ")
term
## Topic 1
## "president, i, the, amp, great, honor, whitehouse"
## Topic 2
## "amp, i, great, get, media, the, people"
## Topic 3
## "the, we, i, will, obamacare, fake, election"
## Topic 4
## "will, great, i, republicans, good, people, a"
## Topic 5
## "will, news, amp, america, american, great, thank"
## Topic 6
## "us, the, amp, i, will, russia, great"
## Topic 7
## "will, great, get, today, the, fake, many"
## Topic 8
## "i, will, amp, big, the, fake, new"
topics <- topics(lda)
topics <- data.frame(date = as.Date(tweets.df$created), topic = topics)
ggplot(topics, aes(date, fill = term[topic])) +
geom_density(position = "stack")
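# (optional sketch) an alternative view of the same assignments: daily tweet
# counts faceted by topic (labels shortened to the topic number)
topics$label <- paste("Topic", topics$topic)
ggplot(topics, aes(x = date)) +
geom_bar() +
facet_wrap(~ label, ncol = 2) +
xlab("Date") + ylab("Tweets")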
# Sentiment analysis
# install the sentiment140 package from GitHub (only needed once)
library(devtools)
install_github("okugami79/sentiment140")
## Skipping install of 'sentiment' from a github remote, the SHA1 (75be56d6) has not changed since last install.
## Use `force = TRUE` to force installation
library(sentiment)
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: rjson
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following object is masked from 'package:twitteR':
##
## id
sentiments <- sentiment(tweets.df$text)
table(sentiments$polarity)
##
## negative neutral positive
## 55 315 100
sentiments$score <- 0
sentiments$score[sentiments$polarity == "positive"] <- 1
sentiments$score[sentiments$polarity == "negative"] <- -1
sentiments$date <- as.Date(tweets.df$created)
results <- aggregate(score ~ date, data = sentiments, sum)
ggplot(results, aes(x = date, y = score)) +
geom_line()
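# (optional sketch) daily mean score rather than the daily sum, which controls
# for days with many tweets
result.mean <- aggregate(score ~ date, data = sentiments, mean)
ggplot(result.mean, aes(x = date, y = score)) +
geom_line()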