import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
import seaborn as sns
def load_data(path: str = "/"):
    """
    Loads the president timeline and birth-state CSV files and combines them.

    Args
    -------
    path: str
        Directory where president_timelines.csv and president_states.csv
        reside. When a file is not found in that directory, the file of the
        same name in the current working directory is used instead, so
        existing calls that relied on the files sitting next to the notebook
        keep working.

    Returns
    --------
    The combined dataframe with two string columns, "name" and "states".
    Rows of the two files are paired by position, not by president name.
    """
    import os

    def _resolve(filename):
        # Prefer the requested directory; fall back to the bare filename
        # (working directory), which is where this function always looked
        # before `path` was honoured.
        candidate = os.path.join(path, filename)
        return candidate if os.path.isfile(candidate) else filename

    df_time = pd.read_csv(_resolve("president_timelines.csv"))
    df_states = pd.read_csv(_resolve("president_states.csv"))

    # Setting columns as specified by the dataframe
    df_time.columns = ["Index", "Name", "Birth", "Death", "TermBegin", "TermEnd"]
    df_states.columns = ["Name", "Birth State"]

    president_names = df_time["Name"]
    print(f"Total number of presidents: {len(president_names)}")
    president_states = df_states["Birth State"]
    print(f"Total states: {len(president_states)}")

    df = pd.DataFrame({"name": president_names, "states": president_states})

    # Cast to str, drop literal double quotes, and trim the state names.
    # Stripping happens after quote removal so whitespace that was hidden
    # inside the quotes is removed as well.
    df["name"] = df["name"].astype(str).str.replace('"', "", regex=False)
    df["states"] = (
        df["states"].astype(str).str.replace('"', "", regex=False).str.strip()
    )
    return df
# Build the combined name/birth-state DataFrame (kept as the module-level
# `df` that the later cells read) and preview its first rows.
df = load_data()
df.head()
Total number of presidents: 45 Total states: 45
name | states | |
---|---|---|
0 | George Washington | Virginia |
1 | John Adams | Massachusetts |
2 | Thomas Jefferson | Virginia |
3 | James Madison | Virginia |
4 | James Monroe | Virginia |
def common(show_counter=False, show_prez=False):
    """
    Report the most frequent birth states in the module-level ``df``.

    Prints the most common state, then the second to fifth most common ones
    (fewer lines when the data holds fewer than five distinct states). States
    with equal counts keep the order in which they first appear in ``df``.
    Nothing is printed when ``df`` has no rows.

    Args
    -------
    show_counter: bool
        When True, also print the distinct states that were counted.
    show_prez: bool
        When True, also print every president whose state is the most
        common one.
    """
    from collections import Counter

    counter = Counter(df['states'].values)

    # Rank by occurrence count. max(counter) would compare the state *names*
    # and return the alphabetically last one, and an inverted
    # {count: state} lookup silently drops states that share a count.
    ranked = [state for state, _ in counter.most_common(5)]
    if not ranked:
        return

    max_counter = ranked[0]
    print(f"Maximum occured state: {max_counter}.")
    for label, state in zip(("Second", "Third", "Fourth", "Fifth"), ranked[1:]):
        print(f"{label} largest state: {state}")

    # presidents with the max_counter
    if show_prez:
        for name, state in zip(df['name'], df['states']):
            if state == max_counter:
                print(f"President with {max_counter} was {name}.")

    if show_counter:
        print(counter.keys())
# Print the state summary; show_counter=False skips the dump of counted keys,
# show_prez=True also lists the presidents whose state is the top-reported one.
common(False, True)
Maximum occured state: Virginia. Second largest state: Ohio Third largest state: New York Fourth largest state: Massachusetts Fifth largest state: Texas President with Virginia was George Washington. President with Virginia was Thomas Jefferson. President with Virginia was James Madison. President with Virginia was James Monroe. President with Virginia was William Henry Harrison. President with Virginia was John Tyler. President with Virginia was Zachary Taylor. President with Virginia was Woodrow Wilson.
# Count (name, state) pairs, pivot the states into columns and use 0 for the
# combinations that never occur, giving a president-by-state indicator table.
state_counts = df.groupby("name")['states'].value_counts()
matrix_based = state_counts.unstack().fillna(0)

# Draw the table as a gray-scale heatmap clipped to the [0, 1] range.
sns.heatmap(matrix_based, vmin=0, vmax=1, cmap="gray")
plt.title("Finding common ground")
Text(0.5, 1.0, 'Finding common ground')
def load_data_from_cdc(save=False, save_path="president_states_cdc.csv"):
    """
    Load the CDC state table and join it to the module-level ``df``.

    The first HTML table of the CDC "state and territorial data" page is
    read, its 'State/Territory' column is renamed to 'states', and the result
    is merged with ``df`` on that column (pandas' default inner join). The
    'Births', 'Fertility Rate' and 'Death Rate' columns are cast to float.

    Args
    -------
    save: bool
        When True, write the merged table to ``save_path`` as CSV.
    save_path: str
        Destination file used when ``save`` is True.

    Returns
    --------
    The merged dataframe.
    """
    # reading the html table at index 0
    df_states_fact = pd.read_html("https://www.cdc.gov/nchs/fastats/state-and-territorial-data.htm")[0]

    # changing column (for join)
    df_states_fact = df_states_fact.rename(columns={'State/Territory': 'states'})

    # Name the join key explicitly instead of relying on whichever column
    # names the two frames happen to share.
    merged = pd.merge(df, df_states_fact, on='states')

    for column in ('Births', 'Fertility Rate', 'Death Rate'):
        merged[column] = merged[column].astype(float)

    # Save the csv. to_csv() without a target only returns the CSV text, so a
    # path is required for anything to reach disk; saving after the casts
    # keeps the file consistent with the returned frame.
    if save:
        merged.to_csv(save_path, index=False)

    return merged
# Join the president table with the CDC state figures.
df_merged = load_data_from_cdc()
# Correlate the numeric columns only. Selecting them explicitly gives the same
# table on older pandas (which dropped text columns silently) and avoids the
# error pandas 2.x raises when corr() meets the string columns.
df_merged.select_dtypes(include="number").corr()
Births | Fertility Rate | Death Rate | |
---|---|---|---|
Births | 1.000000 | 0.226813 | -0.485603 |
Fertility Rate | 0.226813 | 1.000000 | 0.109002 |
Death Rate | -0.485603 | 0.109002 | 1.000000 |
def apply_transformations(df: pd.DataFrame):
    """
    Apply transformation on states and name

    Each value in the "states" and "name" columns is replaced by the
    zero-based position of its last occurrence in that column, so equal
    values share one integer code.

    Args
    -------
    df: pd.DataFrame with "states" and "name" columns. It is not modified.

    Returns
    --------
    A new dataframe with both columns integer-encoded and all other columns
    copied unchanged.
    """
    # Work on a copy: assigning into the argument would also rewrite the
    # caller's frame, which later cells still need with the original strings.
    encoded = df.copy()

    for column in ("states", "name"):
        # Later positions overwrite earlier ones, so every value ends up
        # mapped to the position of its last occurrence.
        codes = {value: position for position, value in enumerate(df[column])}
        encoded[column] = df[column].map(codes)

    return encoded
# Replace the "states" and "name" columns of the merged table with integer
# codes, then preview the first rows of the result.
df_transformed = apply_transformations(df_merged)
df_transformed.head()
name | states | Births | Fertility Rate | Deaths | Death Rate | |
---|---|---|---|---|---|---|
0 | 0 | 7 | 99843.0 | 59.1 | 68579 | 809.7 |
1 | 1 | 7 | 99843.0 | 59.1 | 68579 | 809.7 |
2 | 2 | 7 | 99843.0 | 59.1 | 68579 | 809.7 |
3 | 3 | 7 | 99843.0 | 59.1 | 68579 | 809.7 |
4 | 4 | 7 | 99843.0 | 59.1 | 68579 | 809.7 |
# Pairwise correlation of the numeric columns. They are selected explicitly
# because pandas 2.x raises when corr() is given a non-numeric column, while
# older versions dropped such columns silently.
df_transformed.select_dtypes(include="number").corr()
name | states | Births | Fertility Rate | Death Rate | |
---|---|---|---|---|---|
name | 1.000000 | 0.990701 | 0.093970 | 0.275955 | 0.196630 |
states | 0.990701 | 1.000000 | 0.109458 | 0.307867 | 0.212296 |
Births | 0.093970 | 0.109458 | 1.000000 | 0.226813 | -0.485603 |
Fertility Rate | 0.275955 | 0.307867 | 0.226813 | 1.000000 | 0.109002 |
Death Rate | 0.196630 | 0.212296 | -0.485603 | 0.109002 | 1.000000 |
# Compute the correlation matrix once (numeric columns only) and keep just
# the coefficients above 0.5; every other cell, including NaN, becomes 0.
corr_matrix = df_transformed.select_dtypes(include="number").corr()
corr_matrix.where(corr_matrix > 0.5, 0)
name | states | Births | Fertility Rate | Death Rate | |
---|---|---|---|---|---|
name | 1.000000 | 0.990701 | 0.0 | 0.0 | 0.0 |
states | 0.990701 | 1.000000 | 0.0 | 0.0 | 0.0 |
Births | 0.000000 | 0.000000 | 1.0 | 0.0 | 0.0 |
Fertility Rate | 0.000000 | 0.000000 | 0.0 | 1.0 | 0.0 |
Death Rate | 0.000000 | 0.000000 | 0.0 | 0.0 | 1.0 |
# Scatter plot with a fitted regression line of death rate against births.
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.regplot(x="Births", y="Death Rate", data=df_transformed)
<AxesSubplot:xlabel='Births', ylabel='Death Rate'>
# Regression plot of death rate against fertility rate on its own figure
# grid. x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.lmplot(x="Fertility Rate", y="Death Rate", data=df_transformed)
<seaborn.axisgrid.FacetGrid at 0x7f9650b681f0>
# Read the per-state coordinates and expose the "City" column under the name
# "states" (appended as the last column, with "City" removed) so that it can
# serve as the join key shared with df_merged.
df_latlong = pd.read_csv("statelatlong.csv")
df_latlong = df_latlong.assign(states=df_latlong["City"]).drop(columns=["City"])

# Join on the columns the two frames have in common and preview the result.
df_final = df_merged.merge(df_latlong)
df_final.head()
name | states | Births | Fertility Rate | Deaths | Death Rate | State | Latitude | Longitude | |
---|---|---|---|---|---|---|---|---|---|
0 | George Washington | Virginia | 99843.0 | 59.1 | 68579 | 809.7 | VA | 38.003386 | -79.458786 |
1 | Thomas Jefferson | Virginia | 99843.0 | 59.1 | 68579 | 809.7 | VA | 38.003386 | -79.458786 |
2 | James Madison | Virginia | 99843.0 | 59.1 | 68579 | 809.7 | VA | 38.003386 | -79.458786 |
3 | James Monroe | Virginia | 99843.0 | 59.1 | 68579 | 809.7 | VA | 38.003386 | -79.458786 |
4 | William Henry Harrison | Virginia | 99843.0 | 59.1 | 68579 | 809.7 | VA | 38.003386 | -79.458786 |
# 500x500 map whose initial view is centred on the given coordinates.
map_osm = folium.Map(width=500, height=500, location=[37.0902, -95.7129], zoom_start=3)

# Add one circle marker per row of df_final. A plain loop is used because the
# markers are added purely for their side effect; DataFrame.apply would build
# (and the notebook would echo) a Series of marker objects nobody needs.
for latitude, longitude in zip(df_final["Latitude"], df_final["Longitude"]):
    folium.CircleMarker(location=[latitude, longitude]).add_to(map_osm)
0 <folium.vector_layers.CircleMarker object at 0... 1 <folium.vector_layers.CircleMarker object at 0... 2 <folium.vector_layers.CircleMarker object at 0... 3 <folium.vector_layers.CircleMarker object at 0... 4 <folium.vector_layers.CircleMarker object at 0... 5 <folium.vector_layers.CircleMarker object at 0... 6 <folium.vector_layers.CircleMarker object at 0... 7 <folium.vector_layers.CircleMarker object at 0... 8 <folium.vector_layers.CircleMarker object at 0... 9 <folium.vector_layers.CircleMarker object at 0... 10 <folium.vector_layers.CircleMarker object at 0... 11 <folium.vector_layers.CircleMarker object at 0... 12 <folium.vector_layers.CircleMarker object at 0... 13 <folium.vector_layers.CircleMarker object at 0... 14 <folium.vector_layers.CircleMarker object at 0... 15 <folium.vector_layers.CircleMarker object at 0... 16 <folium.vector_layers.CircleMarker object at 0... 17 <folium.vector_layers.CircleMarker object at 0... 18 <folium.vector_layers.CircleMarker object at 0... 19 <folium.vector_layers.CircleMarker object at 0... 20 <folium.vector_layers.CircleMarker object at 0... 21 <folium.vector_layers.CircleMarker object at 0... 22 <folium.vector_layers.CircleMarker object at 0... 23 <folium.vector_layers.CircleMarker object at 0... 24 <folium.vector_layers.CircleMarker object at 0... 25 <folium.vector_layers.CircleMarker object at 0... 26 <folium.vector_layers.CircleMarker object at 0... 27 <folium.vector_layers.CircleMarker object at 0... 28 <folium.vector_layers.CircleMarker object at 0... 29 <folium.vector_layers.CircleMarker object at 0... 30 <folium.vector_layers.CircleMarker object at 0... 31 <folium.vector_layers.CircleMarker object at 0... 32 <folium.vector_layers.CircleMarker object at 0... 33 <folium.vector_layers.CircleMarker object at 0... 34 <folium.vector_layers.CircleMarker object at 0... 35 <folium.vector_layers.CircleMarker object at 0... 36 <folium.vector_layers.CircleMarker object at 0... 
37 <folium.vector_layers.CircleMarker object at 0... 38 <folium.vector_layers.CircleMarker object at 0... 39 <folium.vector_layers.CircleMarker object at 0... 40 <folium.vector_layers.CircleMarker object at 0... 41 <folium.vector_layers.CircleMarker object at 0... 42 <folium.vector_layers.CircleMarker object at 0... 43 <folium.vector_layers.CircleMarker object at 0... 44 <folium.vector_layers.CircleMarker object at 0... dtype: object
# Bare expression; presumably the last statement of a notebook cell, where it
# makes the notebook render the map inline.
map_osm