Government Documents Text Analysis

Some notes and code on analyzing government documents.

library(tidyverse)   # the One True Package
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'tidyr' was built under R version 3.4.4
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.3
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggrepel)     # repel text labels
## Warning: package 'ggrepel' was built under R version 3.4.4
library(readr)       # Importing data
library(tibble)      # Better data frames
library(tidytext)    # Tidy text mining
## Warning: package 'tidytext' was built under R version 3.4.4
library(broom)
## Warning: package 'broom' was built under R version 3.4.4
library(topicmodels)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.4.4
# Data Prep
# -------------------------------------------

# Read data and convert to dataframe
folder <- "~/Dropbox/dissertation/data/govdocs-ocr/data/"
setwd(folder)

files <- list.files(folder, pattern = "*.txt")
data <- data_frame(filename = files) %>% 
  mutate(file_contents = map(filename,
                             ~ read_file(file.path(folder, .)))
  )
## Warning: package 'bindrcpp' was built under R version 3.4.4
docs <- unnest(data)

# Tokenize
docs_tokens <- docs %>% unnest_tokens(word, file_contents)
docs_tokens
## # A tibble: 10,919,781 x 2
##    filename                                      word      
##    <chr>                                         <chr>     
##  1 1881 - History of Santa Clara County-full.txt pr        
##  2 1881 - History of Santa Clara County-full.txt eface     
##  3 1881 - History of Santa Clara County-full.txt the       
##  4 1881 - History of Santa Clara County-full.txt history   
##  5 1881 - History of Santa Clara County-full.txt of        
##  6 1881 - History of Santa Clara County-full.txt santa     
##  7 1881 - History of Santa Clara County-full.txt clara     
##  8 1881 - History of Santa Clara County-full.txt county    
##  9 1881 - History of Santa Clara County-full.txt was       
## 10 1881 - History of Santa Clara County-full.txt undertaken
## # ... with 10,919,771 more rows
# Clear out stopwords
data("stop_words")
cleaned_docs <- docs_tokens %>% 
  anti_join(stop_words)
## Joining, by = "word"
cleaned_docs %>% 
  count(word, sort = TRUE)
## # A tibble: 231,475 x 2
##    word           n
##    <chr>      <int>
##  1 water      51213
##  2 1          38446
##  3 3          26600
##  4 county     25538
##  5 san        25258
##  6 2          24200
##  7 0          23632
##  8 california 23148
##  9 5          18604
## 10 city       17318
## # ... with 231,465 more rows
cleaned_docs$id <- seq_len(nrow(cleaned_docs))

# Analysis
# -------------------------------------------

# Calculate sentiment
bing <- get_sentiments("bing")
sentiment <- cleaned_docs %>% 
  inner_join(bing) %>% 
  count(filename, index = id %/% 80, sentiment) %>% 
  spread(sentiment, n, fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
# Most common positive and negative words
sentiment_word_counts <- cleaned_docs %>%  
  inner_join(bing) %>% 
  count(word, sentiment, sort = TRUE) %>% 
  ungroup()
## Joining, by = "word"
sentiment_word_counts %>% 
  filter(n > 80) %>% 
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>% 
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")

# Create DTM
word_counts <- cleaned_docs %>% 
  anti_join(stop_words) %>% 
  count(id, word, sort = TRUE)
## Joining, by = "word"
docs_dtm <- word_counts %>% 
  cast_dtm(id, word, n)
## Warning: Trying to compute distinct() for variables not found in the data:
## - `row_col`, `column_col`
## This is an error, but only a warning is raised for compatibility reasons.
## The operation will return the input unchanged.
docs_lda <- LDA(
  x = docs_dtm,
  k = 16,
  method = "Gibbs",
  control = list(seed = 7292)
)

tidy_lda <- tidy(docs_lda)

# Top five terms of each topic
top_terms <- tidy_lda %>%  
  group_by(topic) %>% 
  top_n(10, beta) %>% 
  ungroup() %>% 
  arrange(topic, -beta)
top_terms
## # A tibble: 160 x 3
##    topic term            beta
##    <int> <chr>          <dbl>
##  1     1 air          0.0154 
##  2     1 santa        0.0133 
##  3     1 0            0.0129 
##  4     1 lake         0.0111 
##  5     1 construction 0.0111 
##  6     1 project      0.0101 
##  7     1 information  0.00965
##  8     1 river        0.00947
##  9     1 land         0.00859
## 10     1 table        0.00815
## # ... with 150 more rows
# Graph the top terms
ggplot(top_terms, aes(term, beta, fill = as.factor(topic))) +
  geom_bar(stat = "identity", show.legend=FALSE, alpha = 0.8) +
  coord_flip() +
  labs(title = "Top 10 Terms in Each LDA Topic",
       subtitle = "Topic modeling Silicon Valley city planning documents",
       caption = "Jason A. Heppler",
       x = NULL, y = "beta") +
  facet_wrap(~topic, ncol = 4, scales = "free") +
  theme_tufte(base_family = "Fira Sans", ticks = FALSE) +
  scale_y_continuous(expand=c(0,0)) +
  theme(strip.text = element_text(hjust = 0)) +
  theme(plot.caption = element_text(size = 9))

# Distributed probabilities
lda_gamma <- tidy(docs_lda, matrix = "gamma")

ggplot(lda_gamma, aes(gamma, fill = as.factor(topic))) +
  geom_histogram(show.legend = FALSE, alpha = 0.8) +
  facet_wrap(~topic, ncol = 4) +
  labs(title = "Distribution of Probability for Each Topic",
       subtitle = "Topic modeling government documents",
       caption = "Jason A. Heppler",
       y = NULL, x = "gamma") +
  scale_y_log10() +
  theme_minimal(base_family = "Lato", base_size = 13) +
  theme(strip.text=element_text(hjust=0)) +
  theme(plot.caption=element_text(size=9))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 448 rows containing missing values (geom_bar).