Government Documents Text Analysis
Some notes and code on analyzing government documents.
library(tidyverse) # the One True Package
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.3
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggrepel) # repel text labels
## Warning: package 'ggrepel' was built under R version 3.4.4
library(readr) # Importing data
library(tibble) # Better data frames
library(tidytext) # Tidy text mining
## Warning: package 'tidytext' was built under R version 3.4.4
library(broom)
## Warning: package 'broom' was built under R version 3.4.4
library(topicmodels)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.4.4
# Data Prep
# -------------------------------------------
# Read data and convert to dataframe
folder <- "~/Dropbox/dissertation/data/govdocs-ocr/data/"
setwd(folder)
files <- list.files(folder, pattern = "*.txt")
data <- data_frame(filename = files) %>%
mutate(file_contents = map(filename,
~ read_file(file.path(folder, .)))
)
## Warning: package 'bindrcpp' was built under R version 3.4.4
docs <- unnest(data)
# Tokenize
docs_tokens <- docs %>% unnest_tokens(word, file_contents)
docs_tokens
## # A tibble: 10,919,781 x 2
## filename word
## <chr> <chr>
## 1 1881 - History of Santa Clara County-full.txt pr
## 2 1881 - History of Santa Clara County-full.txt eface
## 3 1881 - History of Santa Clara County-full.txt the
## 4 1881 - History of Santa Clara County-full.txt history
## 5 1881 - History of Santa Clara County-full.txt of
## 6 1881 - History of Santa Clara County-full.txt santa
## 7 1881 - History of Santa Clara County-full.txt clara
## 8 1881 - History of Santa Clara County-full.txt county
## 9 1881 - History of Santa Clara County-full.txt was
## 10 1881 - History of Santa Clara County-full.txt undertaken
## # ... with 10,919,771 more rows
# Clear out stopwords
data("stop_words")
cleaned_docs <- docs_tokens %>%
anti_join(stop_words)
## Joining, by = "word"
cleaned_docs %>%
count(word, sort = TRUE)
## # A tibble: 231,475 x 2
## word n
## <chr> <int>
## 1 water 51213
## 2 1 38446
## 3 3 26600
## 4 county 25538
## 5 san 25258
## 6 2 24200
## 7 0 23632
## 8 california 23148
## 9 5 18604
## 10 city 17318
## # ... with 231,465 more rows
cleaned_docs$id <- seq_len(nrow(cleaned_docs))
# Analysis
# -------------------------------------------
# Calculate sentiment
bing <- get_sentiments("bing")
sentiment <- cleaned_docs %>%
inner_join(bing) %>%
count(filename, index = id %/% 80, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
# Most common positive and negative words
sentiment_word_counts <- cleaned_docs %>%
inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
sentiment_word_counts %>%
filter(n > 80) %>%
mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ylab("Contribution to sentiment")
# Create DTM
word_counts <- cleaned_docs %>%
anti_join(stop_words) %>%
count(id, word, sort = TRUE)
## Joining, by = "word"
docs_dtm <- word_counts %>%
cast_dtm(id, word, n)
## Warning: Trying to compute distinct() for variables not found in the data:
## - `row_col`, `column_col`
## This is an error, but only a warning is raised for compatibility reasons.
## The operation will return the input unchanged.
docs_lda <- LDA(
x = docs_dtm,
k = 16,
method = "Gibbs",
control = list(seed = 7292)
)
tidy_lda <- tidy(docs_lda)
# Top five terms of each topic
top_terms <- tidy_lda %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms
## # A tibble: 160 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 air 0.0154
## 2 1 santa 0.0133
## 3 1 0 0.0129
## 4 1 lake 0.0111
## 5 1 construction 0.0111
## 6 1 project 0.0101
## 7 1 information 0.00965
## 8 1 river 0.00947
## 9 1 land 0.00859
## 10 1 table 0.00815
## # ... with 150 more rows
# Graph the top terms
ggplot(top_terms, aes(term, beta, fill = as.factor(topic))) +
geom_bar(stat = "identity", show.legend=FALSE, alpha = 0.8) +
coord_flip() +
labs(title = "Top 10 Terms in Each LDA Topic",
subtitle = "Topic modeling Silicon Valley city planning documents",
caption = "Jason A. Heppler",
x = NULL, y = "beta") +
facet_wrap(~topic, ncol = 4, scales = "free") +
theme_tufte(base_family = "Fira Sans", ticks = FALSE) +
scale_y_continuous(expand=c(0,0)) +
theme(strip.text = element_text(hjust = 0)) +
theme(plot.caption = element_text(size = 9))
# Distributed probabilities
lda_gamma <- tidy(docs_lda, matrix = "gamma")
ggplot(lda_gamma, aes(gamma, fill = as.factor(topic))) +
geom_histogram(show.legend = FALSE, alpha = 0.8) +
facet_wrap(~topic, ncol = 4) +
labs(title = "Distribution of Probability for Each Topic",
subtitle = "Topic modeling government documents",
caption = "Jason A. Heppler",
y = NULL, x = "gamma") +
scale_y_log10() +
theme_minimal(base_family = "Lato", base_size = 13) +
theme(strip.text=element_text(hjust=0)) +
theme(plot.caption=element_text(size=9))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 448 rows containing missing values (geom_bar).