This blog post is continuity of my learning journey of data scientist
with python through Datacamp. You can see detail illustration and
execution of following scenarios at my kaggle workspace & github. For original course kindly refer here
Read, clean, and validate
# Select the columns and divide by 100
agecon = nsfg['agecon'] / 100
agepreg = nsfg['agepreg'] / 100
# Compute the difference
preg_length = agepreg - agecon
# Compute summary statistics
print(preg_length.describe())
Filter and visualize
# Plot the histogram
plt.hist(agecon, bins=20, histtype='step')
# Label the axes
plt.xlabel('Age at conception')
plt.ylabel('Number of pregnancies')
# Show the figure
plt.show()
Distributions
Probability Mass Functions
year=gss['year']
# Compute the PMF for year
pmf_year = Pmf(year, normalize=False)
# Print the result
print(pmf_year)
# Select the age column
age = gss['age']
# Make a PMF of age
pmf_age = Pmf(age)
# Plot the PMF
pmf_age.bar()
# Label the axes
plt.xlabel('Age')
plt.ylabel('PMF')
plt.show()
Make a CDF
# Select the age column
age = gss['age']
# Compute the CDF of age
cdf_age = Cdf(age)
# Calculate the CDF of 30
print(cdf_age(30))
Compute IQR
# Calculate the 75th percentile
income=gss['realinc']
cdf_income=Cdf.from_seq(income)
income=gss['realinc']
cdf_income=Cdf.from_seq(income)
# Calculate the 75th percentile
percentile_75th = cdf_income.inverse(0.75)
# Calculate the 25th percentile
percentile_25th = cdf_income.inverse(0.25)
# Calculate the interquartile range
iqr = percentile_75th - percentile_25th
# Print the interquartile range
print(iqr)
Plot a CDF
# Select realinc
income = gss['realinc']
# Make the CDF
cdf_income = Cdf(income)
# Plot it
cdf_income.plot()
# Label the axes
plt.xlabel('Income (1986 USD)')
plt.ylabel('CDF')
plt.show()
Comparing distributions
Plot income CDFs
educ = gss['educ'] # Bachelor`s degree bach = (educ >= 16) # Associate degree assc = ((educ >= 14) & (educ < 16)) # High school (12 or fewer years of education) high = (educ <= 12) print(high.mean())
0.5308807991547402
income = gss['realinc']
# Plot the CDFs
Cdf(income[high]).plot(label='High school')
Cdf(income[assc]).plot(label='Associate')
Cdf(income[bach]).plot(label='Bachelor')
# Label the axes
plt.xlabel('Income (1986 USD)')
plt.ylabel('CDF')
plt.legend()
plt.show()
Modeling distributions
Distribution of income
# Extract realinc and compute its log
income = gss['realinc']
log_income = np.log10(income)
# Compute mean and standard deviation
mean = np.mean(log_income)
std = np.std(log_income)
print(mean, std)
# Make a norm object
from scipy.stats import norm
dist = norm(mean,std)
Comparing CDFs
# Evaluate the model CDF
xs = np.linspace(2, 5.5)
ys = dist.cdf(xs)
# Plot the model CDF
plt.clf()
plt.plot(xs, ys, color='gray')
# Create and plot the Cdf of log_income
Cdf(log_income).plot()
# Label the axes
plt.xlabel('log10 of realinc')
plt.ylabel('CDF')
plt.show()
Comparing PDFs
# Evaluate the normal PDF
xs = np.linspace(2, 5.5)
ys = dist.pdf(xs)
# Plot the model PDF
plt.clf()
plt.plot(xs, ys, color='gray')
# Plot the data KDE
sns.kdeplot(log_income)
# Label the axes
plt.xlabel('log10 of realinc')
plt.ylabel('PDF')
plt.show()
Relationships
Exploring relationship
#print(brfss.head())
# Extract age
age = Pmf(brfss['AGE'])
# Plot the PMF
pmf_age = Pmf(age)
pmf_age.bar()
# Label the axes
plt.xlabel('Age in years')
plt.ylabel('PMF')
plt.show()
Scatter plot
# Select the first 1000 respondents
brfss = brfss[:1000]
# Extract age and weight
age = brfss['AGE']
weight = brfss['WTKG3']
# Make a scatter plot
plt.plot(age,weight,'o',alpha=0.1)
plt.xlabel('Age in years')
plt.ylabel('Weight in kg')
plt.show()
Jittering
# Select the first 1000 respondents
brfss = brfss[:1000]
# Add jittering to age
age = brfss['AGE'] + np.random.normal(0,2.5, size=len(brfss))
# Extract weight
weight = brfss['WTKG3']
# Make a scatter plot
plt.plot(age, weight, 'o', markersize=5, alpha=0.2)
plt.xlabel('Age in years')
plt.ylabel('Weight in kg')
plt.show()
Visualizing relation
# Drop rows with missing data
data = brfss.dropna(subset=['_HTMG10', 'WTKG3'])
# Make a box plot
sns.boxplot(x='_HTMG10',y='WTKG3',data=data,whis=10)
# Plot the y-axis on a log scale
plt.yscale('log')
# Remove unneeded lines and label axes
sns.despine(left=True, bottom=True)
plt.xlabel('Height in cm')
plt.ylabel('Weight in kg')
plt.show()
Distribution of income
# Extract income
income = brfss['INCOME2']
# Plot the PMF
Pmf.from_seq(income).bar()
# Label the axes
plt.xlabel('Income level')
plt.ylabel('PMF')
plt.show()
Income & Height
# Drop rows with missing data
data = brfss.dropna(subset=['INCOME2', 'HTM4'])
# Make a violin plot
sns.violinplot(x='INCOME2',y='HTM4',data=data,inner=None)
# Remove unneeded lines and label axes
sns.despine(left=True, bottom=True)
plt.xlabel('Income level')
plt.ylabel('Height in cm')
plt.show()
correlations
# Select columns
columns = ['AGE','INCOME2','_VEGESU1']
subset = brfss[columns]
# Compute the correlation matrix
print(subset.corr())
Simple Regression
from scipy.stats import linregress
# Extract the variables
subset = brfss.dropna(subset=['INCOME2', '_VEGESU1'])
xs = subset['INCOME2']
ys = subset['_VEGESU1']
# Compute the linear regression
res = linregress(xs,ys)
print(res)
# Plot the scatter plot
plt.clf()
x_jitter = xs + np.random.normal(0, 0.15, len(xs))
plt.plot(x_jitter, ys, 'o', alpha=0.2)
# Plot the line of best fit
fx = np.array([xs.min(), xs.max()])
fy = res.intercept + res.slope * fx
plt.plot(fx, fy, '-', alpha=0.7)
plt.xlabel('Income code')
plt.ylabel('Vegetable servings per day')
plt.ylim([0, 6])
plt.show()
Multivariate Thinking
Using StatsModels
from scipy.stats import linregress
import statsmodels.formula.api as smf
# Run regression with linregress
subset = brfss.dropna(subset=['INCOME2', '_VEGESU1'])
xs = subset['INCOME2']
ys = subset['_VEGESU1']
res = linregress(xs,ys)
print(res)
# Run regression with StatsModels
results = smf.ols('_VEGESU1 ~ INCOME2', data = brfss).fit()
print(results.params)
Multiple regression
# Group by educ
grouped = gss.groupby('educ')
# Compute mean income in each group
mean_income_by_educ = grouped['realinc'].mean()
# Plot mean income as a scatter plot
plt.plot(mean_income_by_educ,'o',alpha=0.5)
# Label the axes
plt.xlabel('Education (years)')
plt.ylabel('Income (1986 $)')
plt.show()
Non-linear model of education
import statsmodels.formula.api as smf
# Add a new column with educ squared
gss['age2'] = gss['age'] ** 2
gss['educ2'] = gss['educ'] ** 2
# Run a regression model with educ, educ2, age, and age2
results = smf.ols('realinc ~ educ + educ2 + age + age2', data=gss).fit()
# Print the estimated parameters
print(results.params)
Visualizing regression results
# Run a regression model with educ, educ2, age, and age2
results = smf.ols('realinc ~ educ + educ2 + age + age2', data=gss).fit()
# Make the DataFrame
df = pd.DataFrame()
df['educ'] = np.linspace(0, 20)
df['age'] = 30
df['educ2'] = df['educ']**2
df['age2'] = df['age']**2
# Generate and plot the predictions
pred = results.predict(df)
print(pred.head())
# Plot mean income in each age group
plt.clf()
grouped = gss.groupby('educ')
mean_income_by_educ = grouped['realinc'].mean()
plt.plot(mean_income_by_educ, 'o', alpha=0.5)
# Plot the predictions
pred = results.predict(df)
plt.plot(df['educ'], pred, label='Age 30')
# Label axes
plt.xlabel('Education (years)')
plt.ylabel('Income (1986 $)')
plt.legend()
plt.show()
Logistic regression
# Recode grass
gss['grass'].replace(2, 0, inplace=True)
# Run logistic regression
results = smf.logit('grass ~ age + age2 + educ + educ2 + C(sex)', data=gss).fit()
results.params
# Make a DataFrame with a range of ages
df = pd.DataFrame()
df['age'] = np.linspace(18, 89)
df['age2'] = df['age']**2
# Set the education level to 12
df['educ'] = 12
df['educ2'] = df['educ']**2
# Generate predictions for men and women
df['sex'] = 1
pred1 = results.predict(df)
df['sex'] = 2
pred2 = results.predict(df)
plt.clf()
grouped = gss.groupby('age')
favor_by_age = grouped['grass'].mean()
plt.plot(favor_by_age, 'o', alpha=0.5)
plt.plot(df['age'], pred1, label='Male')
plt.plot(df['age'],pred2,label='Female')
plt.xlabel('Age')
plt.ylabel('Probability of favoring legalization')
plt.legend()
plt.show()
0 comments:
Post a Comment