More data visualization with seaborn
In my earlier blog post on data visualization with Seaborn, we went through part of our Data Science with Python learning journey on DataCamp. This post continues that journey with more data visualization in Seaborn. So let's start.
You can refer to my Kaggle notebook and GitHub for further details.
Seaborn elaboration
Comparing a histogram and distplot
# NOTE(review): these snippets assume `pd`, `plt` and `sns` are already bound to
# pandas, matplotlib.pyplot and seaborn -- the imports are not shown here.
grant_file = 'schoolimprovement2010grants.csv'

# Read the CSV into a DataFrame once and preview its first rows
df = pd.read_csv(grant_file)
print(df.head())

# Display the pandas histogram of Award_Amount
df['Award_Amount'].plot.hist()
plt.show()

# Clear out the pandas histogram
plt.clf()

# Display a Seaborn distplot of the same column
sns.distplot(df['Award_Amount'])
plt.show()

# Clear the distplot
plt.clf()

# Create a distplot with the KDE curve turned off and 20 bins
sns.distplot(df['Award_Amount'],
             kde=False,
             bins=20)

# Display the plot
plt.show()
Rug plot and kde shading
# Distplot of Award_Amount with the histogram switched off, a rug plot
# switched on, and shading requested for the KDE via kde_kws
sns.distplot(
    df['Award_Amount'],
    kde_kws={'shade': True},
    rug=True,
    hist=False,
)

# Render the figure
plt.show()
Create a regression plot
# NOTE(review): `df` must hold a table with `insurance_losses` and `premiums`
# columns at this point; how it is loaded is not shown in this post.
print(df.head())

# regplot: insurance_losses on the x axis, premiums on the y axis
sns.regplot(x="insurance_losses", y="premiums", data=df)
plt.show()

# The same pair of columns drawn with lmplot
sns.lmplot(x="insurance_losses", y="premiums", data=df)
plt.show()
Plotting multiple variables
# lmplot of premiums against insurance_losses, colored by the Region column
sns.lmplot(
    x="insurance_losses",
    y="premiums",
    hue="Region",
    data=df,
)

# Display the figure
plt.show()
Facetting multiple regressions
# lmplot of premiums against insurance_losses, one facet row per Region value
sns.lmplot(
    x="insurance_losses",
    y="premiums",
    row="Region",
    data=df,
)

# Display the figure
plt.show()
Customizing seaborn plots
Setting the default style
# Draw the pandas histogram of fmr_2 twice: first with whatever styling is
# currently active, then again after sns.set() has been called.
for use_seaborn_defaults in (False, True):
    if use_seaborn_defaults:
        # Set the default seaborn style
        sns.set()
    df['fmr_2'].plot.hist()
    plt.show()
    plt.clf()
Comparing styles
# Draw the fmr_2 distplot under the 'dark' style, then under 'whitegrid'
for style_name in ('dark', 'whitegrid'):
    sns.set_style(style_name)
    sns.distplot(df['fmr_2'])
    plt.show()

    # Clear the figure before the next style is drawn
    plt.clf()
Removing spines
# Switch to the 'white' style
sns.set_style('white')

# lmplot with pop2010 on the x axis and fmr_2 on the y axis
sns.lmplot(x='pop2010', y='fmr_2', data=df)

# Strip the spines with seaborn's despine helper
sns.despine()

# Display the result, then clear the figure
plt.show()
plt.clf()
Matplotlib color codes
# Apply seaborn's settings with color codes enabled, then draw the fmr_3
# distplot using the single-letter color 'm' (magenta)
sns.set(color_codes=True)
sns.distplot(df['fmr_3'], color='m')

# Render the figure
plt.show()
Using default palettes
# Compare the 'bright' and 'colorblind' palettes on the same fmr_3 distplot.
# The loop body must be indented; without indentation the `for` statement is
# a syntax error.
for p in ['bright', 'colorblind']:
    sns.set_palette(p)
    sns.distplot(df['fmr_3'])
    plt.show()

    # Clear the figure so the next palette is drawn on an empty canvas
    plt.clf()
Using matplotlib axes
# Build a figure with a single axes object
fig, ax = plt.subplots()

# Draw the fmr_3 distplot onto that axes
sns.distplot(df['fmr_3'], ax=ax)

# Give the x axis a more descriptive label
ax.set_xlabel("3 Bedroom Fair Market Rent")

# Render the figure
plt.show()
Additional plot customizations
# Build a figure with a single axes object
fig, ax = plt.subplots()

# Draw the fmr_1 distplot onto that axes
sns.distplot(df['fmr_1'], ax=ax)

# Set the x label, the x range and the title one property at a time
ax.set_xlabel("1 Bedroom Fair Market Rent")
ax.set_xlim((100, 1500))
ax.set_title("US Rent")

# Render the figure
plt.show()
Adding annotations
# Create a figure and axes, then plot the fmr_1 distribution
fig, ax = plt.subplots()
sns.distplot(df['fmr_1'], ax=ax)

# Customize the labels and limits
ax.set(xlabel="1 Bedroom Fair Market Rent", xlim=(100, 1500), title="US Rent")

# Compute the statistics to annotate from the plotted column; these names
# were previously used without being defined anywhere in this post
median = df['fmr_1'].median()
mean = df['fmr_1'].mean()

# Add vertical lines for the median (dashed magenta) and mean (solid blue)
ax.axvline(x=median, color='m', label='Median', linestyle='--', linewidth=2)
ax.axvline(x=mean, color='b', label='Mean', linestyle='-', linewidth=2)

# Show the legend and plot the data
ax.legend()
plt.show()
Multiple plots
# One row, two columns of axes, with sharey=True
fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, sharey=True)

# fmr_1 goes on ax0 and fmr_2 on ax1, each with its own x label and the
# same x limits
for axis, column, label in (
    (ax0, 'fmr_1', "1 Bedroom Fair Market Rent"),
    (ax1, 'fmr_2', "2 Bedroom Fair Market Rent"),
):
    sns.distplot(df[column], ax=axis)
    axis.set(xlabel=label, xlim=(100, 1500))

# Render the figure
plt.show()
Additional plot types
stripplot() and swarmplot()
# Stripplot of Award_Amount (x) per 'Model Selected' (y), with jitter enabled
sns.stripplot(
    x='Award_Amount',
    y='Model Selected',
    jitter=True,
    data=df,
)
plt.show()

# Swarmplot of the same columns, with hue taken from the Region column
sns.swarmplot(
    x='Award_Amount',
    y='Model Selected',
    hue='Region',
    data=df,
)
plt.show()
boxplots, violinplots and lvplots
# Create a boxplot of Award_Amount per 'Model Selected'
sns.boxplot(data=df,
            x='Award_Amount',
            y='Model Selected')
plt.show()
plt.clf()

# Create a violinplot with the husl palette
sns.violinplot(data=df,
               x='Award_Amount',
               y='Model Selected',
               palette='husl')
plt.show()
plt.clf()

# Create a letter-value plot with the Paired palette and the Region column as
# the hue. seaborn renamed `lvplot` to `boxenplot`; the old name is no longer
# available in current releases, so the new name is used here.
sns.boxenplot(data=df,
              x='Award_Amount',
              y='Model Selected',
              palette='Paired',
              hue='Region')
plt.show()
plt.clf()
barplots, pointplots and countplots
# Countplot of 'Model Selected' on the y axis, split by Region via hue
sns.countplot(
    y="Model Selected",
    hue="Region",
    data=df,
)
plt.show()
plt.clf()

# Pointplot of Award_Amount per 'Model Selected'; capsize adds caps to the
# error bars
sns.pointplot(
    x='Model Selected',
    y='Award_Amount',
    capsize=.1,
    data=df,
)
plt.show()
plt.clf()

# Barplot of Award_Amount per 'Model Selected', split by Region via hue
sns.barplot(
    x='Model Selected',
    y='Award_Amount',
    hue='Region',
    data=df,
)
plt.show()
plt.clf()
Regression and residual plots
# NOTE(review): `df` must hold a table with `Tuition` and `SAT_AVG_ALL`
# columns at this point; how it is loaded is not shown in this post.

# Regression plot of Tuition (y) against SAT_AVG_ALL (x), green '^' markers
sns.regplot(
    x="SAT_AVG_ALL",
    y='Tuition',
    color='g',
    marker='^',
    data=df,
)
plt.show()
plt.clf()

# Residual plot for the same pair of columns, also in green
sns.residplot(
    x="SAT_AVG_ALL",
    y='Tuition',
    color='g',
    data=df,
)
plt.show()
plt.clf()
Regression plot parameters
# Plain regression plot of Tuition (y) against PCTPELL (x)
sns.regplot(x="PCTPELL", y='Tuition', data=df)
plt.show()
plt.clf()

# Same plot with the x values grouped into 5 bins
sns.regplot(x="PCTPELL", y='Tuition', x_bins=5, data=df)
plt.show()
plt.clf()

# Same binned plot, fitted with a 2nd order polynomial
sns.regplot(x="PCTPELL", y='Tuition', x_bins=5, order=2, data=df)
plt.show()
plt.clf()
Binning data
# Three regplots of Tuition (y) against UG (x), drawn one after another:
#   1. regression line disabled (fit_reg=False), leaving only the scatter
#   2. x values grouped into 5 bins
#   3. x values grouped into 8 bins
for regplot_options in ({'fit_reg': False}, {'x_bins': 5}, {'x_bins': 8}):
    sns.regplot(data=df, y='Tuition', x="UG", **regplot_options)
    plt.show()
    plt.clf()
Creating heatmaps
# Cross-tabulate the Group column (rows) against the YEAR column (columns)
pd_crosstab = pd.crosstab(index=df["Group"], columns=df["YEAR"])
print(pd_crosstab)

# Draw the table as a heatmap
sns.heatmap(pd_crosstab)

# Rotate the tick labels so they stay readable
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.show()
Customizing heatmaps
# Cross-tabulate the Group column (rows) against the YEAR column (columns)
pd_crosstab = pd.crosstab(index=df["Group"], columns=df["YEAR"])

# Heatmap with the BuGn colormap, no color bar, and thin lines between cells
sns.heatmap(
    pd_crosstab,
    cmap="BuGn",
    cbar=False,
    linewidths=0.3,
)

# Rotate the tick labels so they stay readable
plt.xticks(rotation=90)
plt.yticks(rotation=0)

# Display the figure, then clear it
plt.show()
plt.clf()
Creating plots on data aware grids
Building a FacetGrid
# FacetGrid with one row per Degree_Type value, in the order given below
degree_rows = ['Graduate', 'Bachelors', 'Associates', 'Certificate']
g2 = sns.FacetGrid(df, row="Degree_Type", row_order=degree_rows)

# Draw a pointplot of SAT_AVG_ALL in every facet
g2.map(sns.pointplot, 'SAT_AVG_ALL')

# Display the figure, then clear it
plt.show()
plt.clf()
Using a factorplot
# seaborn renamed `factorplot` to `catplot`; the old name is no longer
# available in current releases. `kind` is passed explicitly in both calls,
# so the change of default kind between the two functions has no effect here.

# Create a categorical plot that contains boxplots of Tuition values,
# one row per Degree_Type
sns.catplot(data=df,
            x='Tuition',
            kind='box',
            row='Degree_Type')
plt.show()
plt.clf()

# Create a faceted pointplot of SAT_AVG_ALL, one row per Degree_Type in the
# order given by row_order
sns.catplot(data=df,
            x='SAT_AVG_ALL',
            kind='point',
            row='Degree_Type',
            row_order=['Graduate', 'Bachelors', 'Associates', 'Certificate'])
plt.show()
plt.clf()
Using a lmplot
# NOTE(review): `degree_ord` and `inst_ord` are not defined anywhere in this
# post; they must be bound (presumably to lists of Degree_Type and Ownership
# values -- confirm) before this snippet runs.

# FacetGrid with one column per Degree_Type, ordered by degree_ord
g = sns.FacetGrid(df, col_order=degree_ord, col="Degree_Type")

# Scatter plot of UG (x) against PCTPELL (y) in every facet
g.map(plt.scatter, 'UG', 'PCTPELL')
plt.show()
plt.clf()

# The same faceting expressed as a single lmplot call
sns.lmplot(
    x='UG',
    y='PCTPELL',
    col="Degree_Type",
    col_order=degree_ord,
    data=df,
)
plt.show()
plt.clf()

# lmplot of Tuition against SAT_AVG_ALL: columns by Ownership (ordered by
# inst_ord), rows by Degree_Type (Graduate, then Bachelors), hue by WOMENONLY
sns.lmplot(
    x='SAT_AVG_ALL',
    y='Tuition',
    row='Degree_Type',
    row_order=['Graduate', 'Bachelors'],
    col="Ownership",
    col_order=inst_ord,
    hue='WOMENONLY',
    data=df,
)
plt.show()
plt.clf()
Building a PairGrid
# Columns compared in both grids below
pair_columns = ["fatal_collisions", "premiums"]

# PairGrid with plt.scatter mapped onto every cell
g = sns.PairGrid(df, vars=pair_columns)
g2 = g.map(plt.scatter)
plt.show()
plt.clf()

# Same PairGrid, but with plt.hist on the diagonal and plt.scatter elsewhere
g = sns.PairGrid(df, vars=pair_columns)
g2 = g.map_diag(plt.hist)
g3 = g2.map_offdiag(plt.scatter)
plt.show()
plt.clf()
# Scatter pairplot of fatal_collisions and premiums
sns.pairplot(
    vars=["fatal_collisions", "premiums"],
    kind='scatter',
    data=df,
)
plt.show()
plt.clf()

# Same pairplot, colored by Region with the RdBu palette; diag_kws passes
# alpha=.5 to the diagonal plots
sns.pairplot(
    vars=["fatal_collisions", "premiums"],
    kind='scatter',
    hue='Region',
    palette='RdBu',
    diag_kws={'alpha': .5},
    data=df,
)
plt.show()
plt.clf()

# Pairplot with separate x and y variable lists, colored by Region (husl)
sns.pairplot(
    x_vars=["fatal_collisions_speeding", "fatal_collisions_alc"],
    y_vars=['premiums', 'insurance_losses'],
    kind='scatter',
    hue='Region',
    palette='husl',
    data=df,
)
plt.show()
plt.clf()

# Regression pairplot of insurance_losses and premiums with KDE diagonals,
# colored by Region using the BrBG palette
sns.pairplot(
    vars=["insurance_losses", "premiums"],
    kind='reg',
    diag_kind='kde',
    hue='Region',
    palette='BrBG',
    data=df,
)
plt.show()
plt.clf()
Building a JointGrid and jointplot
# NOTE(review): `df` must hold a table with `hum` and `total_rentals` columns
# at this point; how it is loaded is not shown in this post.

# JointGrid of hum (x) against total_rentals (y) on the 'whitegrid' style,
# with the x axis limited to 0.1-1.0
sns.set_style("whitegrid")
g = sns.JointGrid(
    data=df,
    x="hum",
    y="total_rentals",
    xlim=(0.1, 1.0),
)
# regplot for the joint axes, distplot for the marginal axes
g.plot(sns.regplot, sns.distplot)
plt.show()
plt.clf()

# A jointplot of the same columns using kind='reg'
sns.jointplot(
    data=df,
    x="hum",
    y="total_rentals",
    kind='reg',
)
plt.show()
plt.clf()
Jointplots and regression
# Regression jointplot of temp (x) against total_rentals (y), 2nd order fit,
# with the x axis limited to 0-1
sns.jointplot(
    data=df,
    x="temp",
    y="total_rentals",
    kind='reg',
    order=2,
    xlim=(0, 1),
)
plt.show()
plt.clf()

# Residual jointplot for the same columns and the same polynomial order
sns.jointplot(
    data=df,
    x="temp",
    y="total_rentals",
    kind='resid',
    order=2,
)
plt.show()
plt.clf()
Complex jointplots
# Scatter jointplot of temp against casual; the marginals get 10 bins and a
# rug via marginal_kws, then a kdeplot is drawn over the joint axes
g = sns.jointplot(
    data=df,
    x="temp",
    y="casual",
    kind='scatter',
    marginal_kws=dict(bins=10, rug=True),
)
g = g.plot_joint(sns.kdeplot)
plt.show()
plt.clf()

# The same construction with the registered column on the y axis
g = sns.jointplot(
    data=df,
    x="temp",
    y="registered",
    kind='scatter',
    marginal_kws=dict(bins=10, rug=True),
)
g = g.plot_joint(sns.kdeplot)
plt.show()
plt.clf()
0 comments:
Post a Comment