Government Documents Text Analysis

Some notes and code on analyzing a corpus of OCR'd government documents with tidytext: word frequencies, sentiment scores, and LDA topic models.

library(tidyverse)   # the One True Package
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(ggrepel)     # Repel overlapping text labels
library(tidytext)    # Tidy text mining
library(broom)       # Tidy model output
library(topicmodels) # LDA topic models
library(ggthemes)    # Extra ggplot2 themes
# Data Prep
# -------------------------------------------

# Read each text file and gather its contents into a data frame
folder <- "~/Dropbox/dissertation/data/govdocs-ocr/data/"

files <- list.files(folder, pattern = "\\.txt$")
data <- tibble(filename = files) %>% 
  mutate(file_contents = map(filename,
                             ~ read_file(file.path(folder, .)))
  )
docs <- unnest(data, file_contents)

# Tokenize
docs_tokens <- docs %>% unnest_tokens(word, file_contents)
docs_tokens
## # A tibble: 1,957,082 x 2
##                                            filename       word
##                                               <chr>      <chr>
##  1 1946_Collection_Treatment-full-unclean.clean.txt      santa
##  2 1946_Collection_Treatment-full-unclean.clean.txt      clara
##  3 1946_Collection_Treatment-full-unclean.clean.txt     county
##  4 1946_Collection_Treatment-full-unclean.clean.txt     sewage
##  5 1946_Collection_Treatment-full-unclean.clean.txt   disposal
##  6 1946_Collection_Treatment-full-unclean.clean.txt     survey
##  7 1946_Collection_Treatment-full-unclean.clean.txt     report
##  8 1946_Collection_Treatment-full-unclean.clean.txt       upon
##  9 1946_Collection_Treatment-full-unclean.clean.txt        the
## 10 1946_Collection_Treatment-full-unclean.clean.txt collection
## # ... with 1,957,072 more rows
# Clear out stopwords
data("stop_words")
cleaned_docs <- docs_tokens %>% 
  anti_join(stop_words)
## Joining, by = "word"
cleaned_docs %>% 
  count(word, sort = TRUE)
## # A tibble: 72,830 x 2
##          word     n
##         <chr> <int>
##  1      water 13704
##  2          1  7076
##  3        000  6725
##  4 california  6657
##  5        bay  6253
##  6      river  5409
##  7          3  5159
##  8     county  4804
##  9          2  4778
## 10      creek  4528
## # ... with 72,820 more rows
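# The top counts are dominated by bare numbers left over from the OCR. One
# optional refinement (a sketch, not applied below) is to drop purely
# numeric tokens before counting:
docs_no_numbers <- cleaned_docs %>% 
  filter(!grepl("^[0-9.,]+$", word))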
# Add a sequential row id so the text can be broken into chunks below
cleaned_docs <- cleaned_docs %>% 
  mutate(id = row_number())

# Analysis
# -------------------------------------------

# Calculate net sentiment (positive minus negative) in 80-word chunks
bing <- get_sentiments("bing")
sentiment <- cleaned_docs %>% 
  inner_join(bing) %>% 
  count(filename, index = id %/% 80, sentiment) %>% 
  spread(sentiment, n, fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
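# A quick look at these scores (a sketch; the 80-word chunks come from the
# index computed above) plots net sentiment across each document:
ggplot(sentiment, aes(index, sentiment, fill = filename)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~filename, ncol = 2, scales = "free_x")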
# Most common positive and negative words
sentiment_word_counts <- cleaned_docs %>%  
  inner_join(bing) %>% 
  count(word, sentiment, sort = TRUE) %>% 
  ungroup()
## Joining, by = "word"
sentiment_word_counts %>% 
  filter(n > 80) %>% 
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>% 
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")

# Create a document-term matrix, treating each file as a document.
# (Stopwords were already removed above, so no second anti_join is needed;
# counting on the per-token id would make every word its own document.)
word_counts <- cleaned_docs %>% 
  count(filename, word, sort = TRUE)

docs_dtm <- word_counts %>% 
  cast_dtm(filename, word, n)
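# Sanity check: one row per file, one column per unique term
dim(docs_dtm)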

docs_lda <- LDA(
  x = docs_dtm,
  k = 16,
  method = "Gibbs",
  control = list(seed = 7292)
)
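# Choosing k = 16 is a judgment call. One rough check (a sketch; refitting
# is slow on a corpus this size) is to fit at several values of k with the
# keep control set, then compare the final Gibbs log-likelihoods:
ks <- c(8, 16, 24)
fits <- map(ks, ~ LDA(docs_dtm, k = .x, method = "Gibbs",
                      control = list(seed = 7292, keep = 50)))
map_dbl(fits, ~ last(.x@loglikelihood))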

tidy_lda <- tidy(docs_lda)

# Top ten terms of each topic
top_terms <- tidy_lda %>%  
  group_by(topic) %>% 
  top_n(10, beta) %>% 
  ungroup() %>% 
  arrange(topic, -beta)
top_terms
## # A tibble: 161 x 3
##    topic       term        beta
##    <int>      <chr>       <dbl>
##  1     1 california 0.023916794
##  2     1      lands 0.022780638
##  3     1      river 0.021307842
##  4     1      santa 0.020143633
##  5     1    million 0.012583283
##  6     1  reservoir 0.010114598
##  7     1       cost 0.009188840
##  8     1      canal 0.009118707
##  9     1    percent 0.007435513
## 10     1   business 0.007421486
## # ... with 151 more rows
# Graph the top terms
ggplot(top_terms, aes(term, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE, alpha = 0.8) +
  coord_flip() +
  labs(title = "Top 10 Terms in Each LDA Topic",
       subtitle = "Topic modeling Silicon Valley city planning documents",
       caption = "Jason A. Heppler",
       x = NULL, y = "beta") +
  facet_wrap(~topic, ncol = 4, scales = "free") +
  theme_tufte(base_family = "Fira Sans", ticks = FALSE) +
  scale_y_continuous(expand = c(0, 0)) +
  theme(strip.text = element_text(hjust = 0),
        plot.caption = element_text(size = 9))

# Distributed probabilities
lda_gamma <- tidy(docs_lda, matrix = "gamma")

ggplot(lda_gamma, aes(gamma, fill = as.factor(topic))) +
  geom_histogram(show.legend = FALSE, alpha = 0.8) +
  facet_wrap(~topic, ncol = 4) +
  labs(title = "Distribution of Probability for Each Topic",
       subtitle = "Topic modeling government documents",
       caption = "Jason A. Heppler",
       y = NULL, x = "gamma") +
  scale_y_log10() +
  theme_minimal(base_family = "Lato", base_size = 13) +
  theme(strip.text = element_text(hjust = 0),
        plot.caption = element_text(size = 9))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 448 rows containing missing values (geom_bar).
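# Because each file is a document in the DTM, gamma ties topics back to
# their sources; a quick sketch of the dominant topic per file:
lda_gamma %>% 
  group_by(document) %>% 
  top_n(1, gamma) %>% 
  ungroup() %>% 
  arrange(desc(gamma))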