# CHAPTER 30
Final Projects and Real-World Applications
#
1. Chapter Introduction
#
This final chapter synthesizes every skill from the course into 6 complete, portfolio-ready data science projects — from financial analytics to machine learning preprocessing pipelines.
---
python
{ copied = true; setTimeout(() => copied = false, 2000) })" class="text-slate-400 hover:text-white p-1 rounded hover:bg-slate-800 transition relative flex items-center justify-center" title="Copy code">
---
python
{ copied = true; setTimeout(() => copied = false, 2000) })" class="text-slate-400 hover:text-white p-1 rounded hover:bg-slate-800 transition relative flex items-center justify-center" title="Copy code">
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
import pandas as pd
import numpy as np
class HRAnalytics:
    """Synthetic HR dataset with printed headcount, diversity and attrition reports.

    The constructor fabricates ``n`` employee records and stores them in
    ``self.df`` with the columns ``Employee_ID``, ``Dept``, ``Gender``,
    ``Age``, ``Tenure``, ``Salary``, ``Performance``, ``Satisfaction`` and
    ``Left`` (1 = left the company, 0 = still active). Each report method
    prints its table to stdout and returns ``None``.
    """

    def __init__(self, n=500, seed=42):
        """Generate the synthetic workforce.

        Args:
            n: Number of employee rows to generate.
            seed: Seed for the private random generator. The default (42)
                yields the same data as calling ``np.random.seed(42)`` and
                then the module-level ``np.random`` functions, but without
                reseeding NumPy's global generator for the rest of the
                program.
        """
        # A private legacy RandomState draws the same stream as the global
        # generator would after np.random.seed(seed), so results for the
        # default seed are unchanged while global RNG state is left alone.
        rng = np.random.RandomState(seed)

        # The columns are drawn in a fixed order; reordering them would
        # change which random numbers each column receives.
        self.df = pd.DataFrame({
            'Employee_ID': [f'E{i:04d}' for i in range(1, n + 1)],
            'Dept': rng.choice(
                ['Engineering', 'Marketing', 'Sales', 'HR', 'Finance'], n,
                p=[0.35, 0.20, 0.25, 0.10, 0.10],
            ),
            'Gender': rng.choice(['Male', 'Female'], n, p=[0.55, 0.45]),
            'Age': rng.randint(22, 60, n),
            'Tenure': rng.randint(1, 20, n),
            'Salary': rng.normal(75000, 20000, n).clip(30000, 200000).astype(int),
            'Performance': rng.choice([1, 2, 3, 4, 5], n, p=[0.05, 0.15, 0.35, 0.30, 0.15]),
            'Satisfaction': rng.uniform(1, 10, n).round(1),
        })

        # Attrition model: low satisfaction + low performance + very short
        # tenure each add a fixed weight to an employee's churn score.
        churn_score = (
            0.3 * (self.df['Satisfaction'] < 5).astype(float)
            + 0.2 * (self.df['Performance'] <= 2).astype(float)
            + 0.15 * (self.df['Tenure'] < 2).astype(float)
        )
        draws = rng.random_sample(n)
        peak = churn_score.max()
        if peak > 0:
            # Rescale so the riskiest profile leaves with probability 0.35.
            leave_prob = churn_score / peak * 0.35
        else:
            # Nobody has a risk factor (or n == 0): avoid 0/0 producing NaN
            # probabilities; everyone simply stays.
            leave_prob = churn_score * 0.0
        self.df['Left'] = (draws < leave_prob).astype(int)

    def headcount_report(self):
        """Print per-department head count and mean salary, performance and satisfaction."""
        print("=== HEADCOUNT REPORT ===")
        summary = self.df.groupby('Dept').agg(
            Count=('Employee_ID', 'count'),
            Avg_Salary=('Salary', 'mean'),
            Avg_Performance=('Performance', 'mean'),
            Avg_Satisfaction=('Satisfaction', 'mean'),
        )
        print(summary.round(2))

    def diversity_report(self):
        """Print the gender split within each department as row percentages."""
        print("\n=== GENDER DIVERSITY ===")
        # normalize='index' gives per-row proportions; * 100 turns them into percentages.
        pivot = pd.crosstab(self.df['Dept'], self.df['Gender'], normalize='index') * 100
        print(pivot.round(1))

    def attrition_report(self):
        """Print overall and per-department attrition rates and the high-risk head count."""
        print("\n=== ATTRITION ANALYSIS ===")
        # 'Left' is 0/1, so its mean is the fraction of employees who left.
        overall = self.df['Left'].mean() * 100
        print(f"Overall attrition rate: {overall:.1f}%")
        print("\nAttrition by Department:")
        by_dept = self.df.groupby('Dept')['Left'].mean() * 100
        print(by_dept.round(1).sort_values(ascending=False))
        # High risk = satisfaction below 4 but has not left yet.
        high_risk = self.df[(self.df['Satisfaction'] < 4) & (self.df['Left'] == 0)]
        print(f"\nHigh-risk employees (low satisfaction, still active): {len(high_risk)}")
# Build the demo dataset once, then print every report in the same order:
# headcount, gender diversity, attrition.
hr = HRAnalytics()
for report in (hr.headcount_report, hr.diversity_report, hr.attrition_report):
    report()
---
python
{ copied = true; setTimeout(() => copied = false, 2000) })" class="text-slate-400 hover:text-white p-1 rounded hover:bg-slate-800 transition relative flex items-center justify-center" title="Copy code">
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
class MLPreprocessingPipeline:
    """Complete, reusable ML preprocessing pipeline.

    Steps performed by ``fit_transform``: de-duplicate and drop mostly-empty
    rows, optionally drop columns, one-hot encode categorical columns, split
    80/20 into train and test sets, then median-impute and standard-scale the
    features. The imputer and scaler are fitted on the training split only
    and merely applied to the test split.
    """

    def __init__(self):
        """Create the unfitted imputer and scaler."""
        self.imputer = SimpleImputer(strategy='median')
        self.scaler = StandardScaler()
        self.fitted = False
        # Filled in by fit_transform; defined here so that reading it before
        # fitting gives an empty list instead of raising AttributeError.
        self.feature_names = []

    def clean(self, df):
        """Return ``df`` without duplicate rows and without rows that are more than 50% null.

        Args:
            df: Input DataFrame (not modified in place).

        Returns:
            A new DataFrame; the surviving rows keep their original index labels.
        """
        df = df.drop_duplicates()
        # Ceiling division: int(n_cols * 0.5) rounds down for an odd column
        # count and would keep rows that are more than half null.
        min_non_null = (len(df.columns) + 1) // 2
        return df.dropna(thresh=min_non_null)

    def encode(self, df, cat_cols):
        """One-hot encode ``cat_cols``, dropping the first level of each column.

        Args:
            df: Input DataFrame.
            cat_cols: Names of the categorical columns to expand.

        Returns:
            A DataFrame in which each listed column is replaced by its dummy columns.
        """
        return pd.get_dummies(df, columns=cat_cols, drop_first=True)

    def fit_transform(self, df, target_col, cat_cols=None, drop_cols=None):
        """Clean, encode, split, impute and scale ``df``.

        Args:
            df: Raw input DataFrame containing features and the target.
            target_col: Name of the target column.
            cat_cols: Optional list of categorical columns to one-hot encode.
            drop_cols: Optional list of columns to discard before encoding.

        Returns:
            ``(X_train_scaled, X_test_scaled, y_train, y_test)`` where the two
            feature sets are the scaler's output and the targets are the
            corresponding slices of ``df[target_col]``.
        """
        df = self.clean(df)
        if drop_cols:
            df = df.drop(columns=drop_cols)
        if cat_cols:
            df = self.encode(df, cat_cols)

        X = df.drop(columns=[target_col])
        y = df[target_col]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Fit on the training split only, then reuse the learned parameters
        # on the test split.
        X_train_imputed = self.imputer.fit_transform(X_train)
        X_test_imputed = self.imputer.transform(X_test)
        X_train_scaled = self.scaler.fit_transform(X_train_imputed)
        X_test_scaled = self.scaler.transform(X_test_imputed)

        self.fitted = True
        self.feature_names = X_train.columns.tolist()
        print(f"Pipeline complete: {X_train_scaled.shape[0]} train, {X_test_scaled.shape[0]} test samples")
        print(f"Features: {len(self.feature_names)}")
        return X_train_scaled, X_test_scaled, y_train, y_test
# Usage: build a synthetic employee table, blank out some salaries, then run
# the full preprocessing pipeline on it.
np.random.seed(42)
n = 1000

# Columns are drawn one by one in a fixed order so the seeded random stream
# is consumed in the same sequence every run.
ages = np.random.normal(35, 10, n).clip(18, 70).astype(int)
salaries = np.random.normal(60000, 20000, n)
depts = np.random.choice(['Eng', 'Mkt', 'Sales'], n)
experience = np.random.randint(0, 30, n)
left = np.random.choice([0, 1], n, p=[0.75, 0.25])
df = pd.DataFrame({
    'Age': ages,
    'Salary': salaries,
    'Dept': depts,
    'Experience': experience,
    'Left': left,
})

# Knock out 50 distinct salary values to give the imputer something to fill.
missing_rows = np.random.choice(n, 50, replace=False)
df.loc[missing_rows, 'Salary'] = np.nan

pipeline = MLPreprocessingPipeline()
splits = pipeline.fit_transform(df, target_col='Left', cat_cols=['Dept'])
X_train, X_test, y_train, y_test = splits
MCQs
#
Question 1
What does the Sharpe ratio measure?
A
Total return
B
Return per unit of risk (excess return / volatility)
C
Maximum loss
Check Answer
Question 2
What does np.cumprod(1 + returns) simulate?
A
Additive returns
B
Compound growth (multiplicative returns)
C
Linear trend
Check Answer
Question 3
What is the HR attrition rate?
A
Hire rate
B
Percentage of employees who left in a period
C
Salary growth
Check Answer
Question 4
What does SimpleImputer(strategy='median') fill missing values with?
A
Mean
B
Column median (robust to outliers)
C
Mode
Check Answer
Question 5
What does drop_first=True in get_dummies prevent?
A
Errors
B
Dummy variable trap (perfect multicollinearity)
C
Nulls
Check Answer
Question 6
Which rows does dropna keep when thresh=int(len(df.columns)*0.5)?
A
No nulls
B
At least 50% non-null values (int() rounds the threshold down when the column count is odd)
C
All nulls
Check Answer
Question 7
What does pd.crosstab(..., normalize='index') show?
A
Counts
B
Row proportions (each row sums to 1; multiply by 100 for percentages)
C
Column proportions
Check Answer
Question 8
What does maximum drawdown measure?
A
Minimum price
B
Peak-to-trough percentage decline
C
Volatility
Check Answer
Question 9
How do fit_transform and transform differ when scaling test data?
A
Same
B
Test data uses only transform (parameters from training)
C
Neither
Check Answer
Question 10
What does OOP enable in data science projects?
A
Speed
B
Reusable, modular, testable pipelines
C
Less code
Check Answer
Interview Questions
#
Q: Design a complete data preprocessing pipeline for a churn prediction model.
Q: How would you build a financial dashboard using Pandas time series features?
Course Complete! 🎉
#
text
{ copied = true; setTimeout(() => copied = false, 2000) })" class="text-slate-400 hover:text-white p-1 rounded hover:bg-slate-800 transition relative flex items-center justify-center" title="Copy code">
Finish this Chapter
Save your progress on your learning path and prepare for coding interview challenges.
r.json()).then(res => { saving = false; if(res.success) { completed = true; $dispatch('lesson-complete'); } else console.error(res.error); }).catch(() => saving = false) }"
class="w-full sm:w-auto inline-flex items-center justify-center px-8 py-3.5 text-base font-bold rounded-xl text-white transition-all duration-300 transform hover:-translate-y-0.5 active:translate-y-0 shadow-lg"
:class="completed ? 'bg-emerald-500 hover:bg-emerald-600 shadow-emerald-500/15 dark:shadow-none' : 'bg-indigo-600 hover:bg-indigo-700 shadow-indigo-600/15 dark:shadow-none'">
Was this lesson helpful?
👍 Yes
👎 No
Thanks for your feedback! 🙏