Anthony Jimenez
20 August 2021
http://www.fa-jimenez.com/
Data visualization is increasingly important for sharing results and giving meaning to a project, so this notebook uses the plotly library to generate an animated bar chart.
Main Data Sources:
Because US Census data is only provided every 10 years, a simple NumPy function (np.interp()) will be used to linearly interpolate between the provided data points, giving the animated bar chart more frames to step through.
Finally, before plotting, we will generate a function that will allow us to filter our dataframe to show only the top 10 US State Populations for each given year.
import pandas as pd
import numpy as np
import plotly.express as px
# Fetch the census workbook from GitHub; sheet_name=None asks pandas for every
# sheet at once, which is then looked up by sheet name below
xls_file = r'https://github.com/ajmz1/Data-Visualizations/blob/main/us_census_data_by_state.xlsx?raw=true'
xls = pd.read_excel(xls_file, sheet_name=None)

# 1790-1860 sheet: discard the 'Admitted[b]' column (in place), then reshape
# from one-column-per-year to long Name / variable / value rows
df1_wide = xls['1790-1860']
df1_wide.drop(columns=['Admitted[b]'], inplace=True)
df1 = pd.melt(df1_wide, id_vars=['Name'])

# 1870-1950 sheet: same wide-to-long reshape
df2_wide = xls['1870-1950']
df2 = pd.melt(df2_wide, id_vars=['Name'])

# 1960-2020 sheet: same wide-to-long reshape
df3_wide = xls['1960-2020']
df3 = pd.melt(df3_wide, id_vars=['Name'])
df3
Name | variable | value | |
---|---|---|---|
0 | Alabama | 1960 | 3266740.0 |
1 | Alaska | 1960 | 226167.0 |
2 | American Samoa | 1960 | 20051.0 |
3 | Arizona | 1960 | 1302161.0 |
4 | Arkansas | 1960 | 1786272.0 |
... | ... | ... | ... |
475 | Washington | 2020 | 7705281.0 |
476 | West Virginia | 2020 | 1793716.0 |
477 | Wisconsin | 2020 | 5893718.0 |
478 | Wyoming | 2020 | 576851.0 |
479 | United States | 2020 | NaN |
480 rows × 3 columns
# Stack the three long-format frames into a single table.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Each frame's original row labels are kept, exactly as
# append did by default.
df = pd.concat([df1, df2, df3])
df
Name | variable | value | |
---|---|---|---|
0 | Alabama | 1790 | NaN |
1 | Alaska | 1790 | NaN |
2 | American Samoa | 1790 | NaN |
3 | Arizona | 1790 | NaN |
4 | Arkansas | 1790 | NaN |
... | ... | ... | ... |
475 | Washington | 2020 | 7705281.0 |
476 | West Virginia | 2020 | 1793716.0 |
477 | Wisconsin | 2020 | 5893718.0 |
478 | Wyoming | 2020 | 576851.0 |
479 | United States | 2020 | NaN |
1449 rows × 3 columns
# Collapse to one population figure per (Name, variable) pair, giving a Series
# with a two-level index. Missing values add nothing to a sum, so a pair whose
# only entry is NaN comes out as 0.0.
df_format = df.groupby(by=['Name', 'variable'])['value'].agg('sum')
df_format
Name variable Alabama 1790 0.0 1800 1250.0 1810 9046.0 1820 127901.0 1830 309527.0 ... Wyoming 1990 453588.0 2000 493782.0 2010 563626.0 2015 600000.0 2020 576851.0 Name: value, Length: 1449, dtype: float64
# Main function for creating dense population dataframes based off of US Census data
def interpolate_year_data(df, n_interp, start_year=1790, end_year=2020):
    """Linearly interpolate each state's population onto an evenly spaced year grid.

    Parameters
    ----------
    df : pandas.Series
        Population values carrying a two-level index: level 0 is the state
        name, level 1 is the year of the observation. Within each state the
        years must be in increasing order, as ``np.interp`` requires.
    n_interp : int
        Number of evenly spaced points to generate from ``start_year`` to
        ``end_year`` (both endpoints included).
    start_year, end_year : float, optional
        Bounds of the interpolation grid. The defaults (1790 and 2020) are the
        span that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``State`` and ``Population`` with ``n_interp`` rows
        per state, states in order of first appearance in ``df``, and a fresh
        0..N-1 index. ``Year`` and ``Population`` are float columns. If ``df``
        has no rows, an empty frame with the same three columns is returned.
    """
    columns = ['Year', 'State', 'Population']

    # Define the interpolation data points we seek
    time_interp = np.linspace(start_year, end_year, n_interp)

    # Level values are loop-invariant, so look them up once
    states = df.index.get_level_values(0)

    # Collect one frame per state and concatenate once at the end; growing a
    # frame inside the loop re-copies every earlier row on each iteration, and
    # DataFrame.append no longer exists in pandas >= 2.0
    frames = []
    for state in states.unique():
        state_data = df[states == state]

        # x- and y-coordinates of the known data points for this state
        xp = np.asarray(state_data.index.get_level_values(1), dtype=float)
        fp = np.asarray(state_data, dtype=float)

        frames.append(pd.DataFrame({
            'Year': time_interp,
            'State': [state] * len(time_interp),
            'Population': np.interp(time_interp, xp, fp),
        }, columns=columns))

    # No states at all: return an empty, correctly labelled frame
    if not frames:
        return pd.DataFrame(columns=columns)

    # ignore_index gives the clean 0..N-1 index just for ease of reading
    return pd.concat(frames, ignore_index=True)
# Filter the data to show only the top "X" states by population in preparation for chart animation
def return_top_states(df, n_top):
    """Return the ``n_top`` most populous states for every year in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Year``, ``State`` and ``Population`` columns. Rows whose
        ``State`` is ``'United States'`` are ignored. ``df`` is not modified.
    n_top : int
        Maximum number of states to keep per year.

    Returns
    -------
    pandas.DataFrame
        For each distinct year (in order of first appearance), the rows with
        the largest ``Population``, sorted in descending order. ``Year`` is
        cast to int (fractional years are truncated), ``Population`` is cast
        to float and rounded to 0 decimals, and the index is reset to 0..N-1.
        If no rows remain after filtering, an empty frame with the input's
        columns is returned.
    """
    # Filter out the US population rows
    states_only = df[df['State'] != 'United States']

    # One sorted, truncated slice per year; concatenated once below instead of
    # using DataFrame.append, which no longer exists in pandas >= 2.0
    frames = [
        states_only[states_only['Year'] == year]
        .sort_values(by='Population', ascending=False)
        .head(n_top)
        for year in states_only['Year'].unique()
    ]

    if frames:
        df_out = pd.concat(frames, ignore_index=True)
    else:
        # Nothing to rank: keep the column layout, and copy so the column
        # assignments below never write into the caller's frame
        df_out = states_only.iloc[:0].copy()

    # Format the columns before outputting
    df_out['Year'] = df_out['Year'].astype(int)
    df_out['Population'] = df_out['Population'].astype(float).round(decimals=0)
    df_out.reset_index(drop=True, inplace=True)
    return df_out
# Densify the grouped census series onto 30 evenly spaced time points per state
df_dense = interpolate_year_data(df_format, 30)

# Keep only the 10 largest states at each of those time points
df_out = return_top_states(df_dense, 10)

# Horizontal bar chart: one animation frame per Year value, bars coloured by state
fig = px.bar(
    x=df_out.Population,
    y=df_out.State,
    orientation='h',
    animation_frame=df_out.Year,
    color=df_out.State,
)

# Hide the legend, blank out both axis titles and set the chart title
fig.update_layout(
    showlegend=False,
    title_text='Evolution of State Population from 1790 - 2020',
    yaxis_title='',
    xaxis_title='',
)

# Order the bars so the greatest population value sits at the top
fig.update_yaxes(categoryorder='total ascending')

# Adjust the animation speed: the second argument of the first button in the
# first update menu holds the frame and transition settings
play_args = fig.layout.updatemenus[0].buttons[0].args[1]
play_args['frame']['duration'] = 1000
play_args['transition']['duration'] = 1000

fig.show()