Visualizing Topic Models

Jason Heppler

Visualizing Topic Models

Taking notes here from Jeri Wieringa’s topic models over time.

The necessary libraries:

{% highlight python %}
import gensim
from ggplot import *
import json
import logging
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import pyLDAvis.gensim
import seaborn as sns
import warnings
{% endhighlight %}

Turn on notebook visualizations:

{% highlight python %}
%matplotlib inline
pyLDAvis.enable_notebook()

# Fix stubborn warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 10

base_dir = ""
period = '1859-to-1875'
directory = "historical_periods"
{% endhighlight %}

Load the saved model, corpus, and dictionary:

{% highlight python %}
lda_model = gensim.models.LdaModel.load(os.path.join(base_dir, 'models', directory, '{}.model'.format(period)))
corpus = gensim.corpora.MmCorpus(os.path.join(base_dir, 'corpora', directory, '{}.mm'.format(period)))
dictionary = gensim.corpora.Dictionary.load(os.path.join(base_dir, 'corpora', directory, '{}.dict'.format(period)))
{% endhighlight %}

Now, visualize the model:

{% highlight python %} pyLDAviz.gensim.prepare(lda_model, corpus, directory) {% endhighlight %}

Preprocess the model and export information about the weights per document and topic labels to CSV.

{% highlight python %}
metadata_filename = os.path.join(base_dir, '2017-05-Composite-OCR-statistics.csv')
index_filename = os.path.join(base_dir, 'corpora', directory, '{}.txt'.format(period))
labels_filename = os.path.join(base_dir, 'dataframes', directory, '{}_topicLabels.csv'.format(period))
doc_topic_filename = os.path.join(base_dir, 'dataframes', directory, '{}_dtm.csv'.format(period))

def doc_list(index_filename):
    """Load the corpus index file into a dataframe.

    The index file is a JSON document mapping each index position to a
    filename. It was written while the corpus (.mm) file was being built,
    to record the filename of each document as it was processed.

    Args:
        index_filename: Path to the JSON index file.

    Returns:
        A dataframe with one row per document and two columns:
        ``index_pos`` (the JSON key, cast to int) and ``doc_id`` (the
        JSON value).
    """
    # JSON is UTF-8 by specification; don't depend on the platform default.
    with open(index_filename, encoding="utf-8") as data_file:
        data = json.load(data_file)

    # orient='index' turns the JSON keys into the frame's index;
    # reset_index() then moves them into an ordinary first column.
    docs = pd.DataFrame.from_dict(data, orient='index').reset_index()
    docs.columns = ['index_pos', 'doc_id']
    # JSON object keys are always strings, so cast positions back to int.
    docs['index_pos'] = docs['index_pos'].astype(int)

    return docs

def compile_dataframe(index, dtm, labels, metadata):
    """Combine the separate dataframes into one large composite dataframe.

    Args:
        index: Dataframe with ``index_pos`` and ``doc_id`` columns.
        dtm: Long-format document/topic dataframe with ``index_pos`` and
            ``topic_id`` columns.
        labels: Dataframe of topic labels keyed by ``topic_id``.
        metadata: Dataframe of document metadata keyed by ``doc_id``.

    Returns:
        A dataframe with one row per row of ``dtm`` that has a matching
        ``topic_id`` in ``labels``, extended with the label columns and
        the document metadata columns.
    """
    # Left join: keep every indexed document even if it has no metadata.
    doc2metadata = index.merge(metadata, on='doc_id', how="left")
    # Default (inner) join: topics without a label row are dropped.
    topics_expanded = dtm.merge(labels, on='topic_id')

    # Left join: keep every document/topic row, attaching metadata if any.
    return topics_expanded.merge(doc2metadata, on="index_pos", how="left")

metadata = pd.read_csv(metadata_filename, usecols=['doc_id', 'year', 'title'])
docs_index = doc_list(index_filename)
dt = pd.read_csv(doc_topic_filename)
labels = pd.read_csv(labels_filename)
{% endhighlight %}

Clean up some data and compile the data frame:

{% highlight python %}
# Reorient from long to wide
dtm = dt.pivot(index='index_pos', columns='topic_id', values='topic_weight').fillna(0)

# Divide each value in a row by the sum of the row to normalize the values
dtm = (dtm.T/dtm.sum(axis=1)).T

# Shift back to a long dataframe
dt_norm = dtm.stack().reset_index()
dt_norm.columns = ['index_pos', 'topic_id', 'norm_topic_weight']

df = compile_dataframe(docs_index, dt_norm, labels, metadata)
df
{% endhighlight %}