This assignment focuses on building machine learning models to predict in-hospital and 30-day mortality for ICU patients using real-world clinical data. We worked with a dataset from the Beth Israel Deaconess Medical Center, which contains timestamped medical observations and mortality labels. The core task was to extract meaningful features from these time-series records and use them to train classification models that can identify high-risk patients.
The assignment is divided into three major components:
First, we implemented a feature extraction pipeline that transforms each patient's time-series and static data into a structured feature vector. This includes handling missing values, normalizing feature scales, and converting raw inputs into model-ready formats.
Next, we explored model training and hyperparameter selection using logistic regression with both L1 and L2 regularization, evaluating performance through k-fold cross-validation on metrics such as AUROC, F1-score, and accuracy.
Finally, the challenge section required applying the full pipeline to a larger dataset of 10,000 patients to predict 30-day mortality on a held-out set and submit the predictions for evaluation. A rough end-to-end sketch follows below.
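At a high level, the pieces documented below chain together roughly like this (a minimal sketch built only from the helper functions defined later in this writeup, and assuming "from sklearn import metrics" is in scope; this is not the exact driver code):

X_train, y_train, X_test, y_test, feature_names = get_train_test_split()
clf = get_classifier(penalty='l2', C=1.0)
clf.fit(X_train, y_train)
y_prob = clf.predict_proba(X_test)[:, 1]  # probability of in-hospital death
print(metrics.roc_auc_score(y_test, y_prob))  # AUROC on the 20% test split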
hw3_main.py
# part of hw3_main.py
# includes code for the preprocessing step
import warnings
import numpy as np
from sklearn import exceptions
from sklearn.preprocessing import OneHotEncoder
warnings.filterwarnings('ignore', category=exceptions.UndefinedMetricWarning)
from helper import *
def generate_feature_vector(df):
"""
    Reads a dataframe containing all measurements for a single patient
    within the first 48 hours of the ICU admission, and converts it into
    a feature vector.
    Input:
        df: pd.DataFrame, with columns [Time, Variable, Value]
Returns:
a python dictionary of format {feature_name: feature_value}
for example, {'Age': 32, 'Gender': 0, 'mean_HR': 84, ...}
"""
static_variables = config['static']
timeseries_variables = config['timeseries']
feature_dict = {}
feature_count = {}
for var, val in zip(df['Variable'], df['Value']):
        # Count occurrences of each time-series variable; static variables
        # appear at most once, so no count is needed for them
        if var in timeseries_variables:
            if var not in feature_count: feature_count[var] = 1
            else: feature_count[var] += 1
        # Store static variables directly (treating -1.0 as missing), and
        # accumulate a running sum for each time-series variable under a key
        # of the form mean_{variable name}
if var in static_variables:
if val == -1.0: feature_dict[var] = np.nan # handling missing cases
else: feature_dict[var] = val
elif var in timeseries_variables:
if 'mean_'+var not in feature_dict: feature_dict['mean_'+var] = val
else: feature_dict['mean_' + var] += val
    # feature_count now holds per-variable counts and feature_dict the running
    # sums; divide each sum by its count to obtain the mean
    for var in feature_count: feature_dict['mean_'+var] /= feature_count[var]
    # Fill in NaN for any time-series variable that never appeared for this patient
for var in timeseries_variables:
if 'mean_'+var not in feature_dict: feature_dict['mean_'+var] = np.nan
# Compute additional statistical features
for var in ['HR', 'MAP', 'Glucose']:
values = df[df['Variable'] == var]['Value']
        if not values.empty:  # avoid computing statistics on an empty series
feature_dict['std_'+var] = np.std(values)
feature_dict['min_'+var] = np.min(values)
else:
feature_dict['std_'+var] = np.nan
feature_dict['min_'+var] = np.nan
    # One-hot encode nominal variables. Note: the encoder is fit per patient, so
    # each patient contributes only the column for their own category; aggregating
    # the per-patient dictionaries into a DataFrame (see helper.py) aligns these
    # columns across patients and leaves NaN where a category does not apply.
    nominal_vars = ['ICUType', 'Gender']
    for var in nominal_vars:
        # Select the rows where Variable equals the current nominal variable
values = df[df['Variable'] == var]['Value'].values.reshape(-1, 1)
# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Fit the encoder to the values and transform them into one-hot encoding
encoded = encoder.fit_transform(values)
# Add the one-hot encoded values to the feature dictionary
for i, category in enumerate(encoder.categories_[0]):
feature_dict[f'{var}_{category}'] = encoded[0][i]
    # Add an interaction feature combining age and mean heart rate; the +0.01 in
    # the denominator guards against division by zero, and a missing mean_HR
    # simply propagates NaN, which imputation handles later
    mean_HR = feature_dict['mean_HR']
    feature_dict['age_hr_info'] = feature_dict['Age'] * (1 / (mean_HR + 0.01))
return feature_dict
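# A minimal usage sketch on a hypothetical toy patient (this assumes config.yaml
# lists 'Age', 'Gender', and 'ICUType' as static variables and 'HR' as a
# time-series variable, which is not shown in this writeup):
#
# >>> toy = pd.DataFrame({'Time':     ['00:00', '00:00', '00:00', '01:00', '02:00'],
# ...                     'Variable': ['Age', 'Gender', 'ICUType', 'HR', 'HR'],
# ...                     'Value':    [64.0, 1.0, 3.0, 80.0, 90.0]})
# >>> fv = generate_feature_vector(toy)
# >>> fv['mean_HR'], fv['std_HR'], fv['min_HR']
# (85.0, 5.0, 80.0)
# >>> fv['Gender_1.0'], fv['ICUType_3.0']
# (1.0, 1.0)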
def impute_missing_values(X):
"""
For each feature column, impute missing values (np.nan) with the
population mean for that feature. If more than half of the values
in a column are missing, remove that column entirely.
Input:
X: np.array, shape (N, d). X could contain missing values
Returns:
X: np.array, shape (N, d'). X does not contain any missing values,
and columns with more than half missing values are removed.
"""
# Calculate the number of missing values per column
missing_counts = np.sum(np.isnan(X), axis=0)
# Identify columns where less than half the values are missing
valid_columns = missing_counts < (X.shape[0] / 2)
# Keep only valid columns
X = X[:, valid_columns]
# Compute column means for remaining columns
col_means = np.nanmean(X, axis=0)
# Find NaN indices and replace with column means
NaN_idx = np.isnan(X)
X[NaN_idx] = np.take(col_means, np.where(NaN_idx)[1])
return X
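# A quick sanity check on toy data (not from the dataset): column 1 has 3 of 4
# values missing (more than half), so it is dropped entirely; the remaining NaN
# in column 0 is replaced by that column's mean, (1 + 3 + 5) / 3 = 3.
#
# >>> X = np.array([[1.0, np.nan],
# ...               [3.0, np.nan],
# ...               [np.nan, 5.0],
# ...               [5.0, np.nan]])
# >>> impute_missing_values(X)
# array([[1.],
#        [3.],
#        [3.],
#        [5.]])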
def normalize_feature_matrix(X):
"""
For each feature column, normalize all values to range [0, 1].
Input:
X: np.array, shape (N, d).
Returns:
X: np.array, shape (N, d). Values are normalized per column.
"""
d = X.shape[1] # number of columns
for c in range(d):
min_val = np.nanmin(X[:, c])
max_val = np.nanmax(X[:, c])
if min_val == max_val:
X[:,c] = 0.0
        else:
            # replace every entry of column c with its min-max normalized value
            X[:, c] = (X[:, c] - min_val) / (max_val - min_val)
return X
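# A quick sanity check on toy data: the first column is min-max scaled to
# [0, 1]; the second column is constant, so it collapses to all zeros.
#
# >>> X = np.array([[0.0, 10.0],
# ...               [5.0, 10.0],
# ...               [10.0, 10.0]])
# >>> normalize_feature_matrix(X)
# array([[0. , 0. ],
#        [0.5, 0. ],
#        [1. , 0. ]])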
...
hw3_challenge.py
# hw3_challenge.py
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression
# Personally imported functions
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import hw3_main
from helper import *
def generate_feature_vector_challenge(df):
return hw3_main.generate_feature_vector(df)
def impute_missing_values_challenge(X):
return hw3_main.impute_missing_values(X)
def normalize_feature_matrix_challenge(X):
return hw3_main.normalize_feature_matrix(X)
def print_GridSearchCV_result(X_train, y_train, X_test, y_test, penalty, scoring):
    # Weight the positive (death) class 4x to compensate for class imbalance
    clf = GridSearchCV(
        LogisticRegression(penalty=penalty, class_weight={-1: 1, 1: 4}, solver='saga', max_iter=5000),
{'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]},
scoring=scoring,
cv=StratifiedKFold(n_splits=5)
)
# Train the classifier
clf.fit(X_train, y_train)
# Print what penalty and scoring was used
print("=======================================")
print("For C in range of 0.001 to 1000 with x10 increments")
print("Best C for Penalty of",penalty, "and scoring of", scoring,"is C =",clf.best_params_['C'])
# Get the best model
best_clf = clf.best_estimator_
# Get predictions on validation set
y_pred = best_clf.predict(X_test)
y_prob = best_clf.predict_proba(X_test)[:, 1] # Probability for positive class
# Compute confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)
# Print confusion matrix in a clear format
print("\\nConfusion Matrix:")
print(f"TP: {cm[1, 1]} FP: {cm[0, 1]}")
print(f"FN: {cm[1, 0]} TN: {cm[0, 0]}")
# Compute evaluation metrics
acc = (cm[1, 1] + cm[0, 0]) / (cm[0, 0] + cm[1, 0] + cm[0, 1] + cm[1, 1])
auroc = metrics.roc_auc_score(y_test, y_prob)
f1 = metrics.f1_score(y_test, y_pred)
# Print results
print("\\nEvaluation Metrics:")
print(f"Accuracy: {acc:.4f}")
print(f"AUROC: {auroc:.4f}")
print(f"F1 Score: {f1:.4f}")
return best_clf
def run_challenge(X_challenge, y_challenge, X_heldout):
# TODO:
# Read challenge data
# Train a linear classifier and apply to heldout dataset features
# Use make_challenge_submission to print the predicted labels
print("================= Part 3 ===================")
print("Part 3: Challenge")
    # Hold out 20% as a validation split, stratified to preserve the class ratio
    X_train, X_test, y_train, y_test = train_test_split(X_challenge, y_challenge, test_size=0.2, stratify=y_challenge)
# Get the best model
clf1 = print_GridSearchCV_result(X_train, y_train, X_test, y_test, 'l2', 'f1')
clf2 = print_GridSearchCV_result(X_train, y_train, X_test, y_test, 'l2', 'roc_auc')
# y_score = clf?.predict_proba(X_heldout)[:, 1]
# y_label = clf?.predict(X_heldout)
# make_challenge_submission(y_label, y_score)
if __name__ == '__main__':
np.random.seed(42)
# Read challenge data
X_challenge, y_challenge, X_heldout, feature_names = get_challenge_data()
run_challenge(X_challenge, y_challenge, X_heldout)
# test_challenge_output()
helper.py
# HW3 - helper.py
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import yaml
with open('config.yaml') as f:
    config = yaml.safe_load(f)
import hw3_main
import hw3_challenge
def get_train_test_split():
"""
This function performs the following steps:
- Reads in the data from data/labels.csv and data/files/*.csv (keep only the first 2,500 examples)
- Generates a feature vector for each example
- Aggregates feature vectors into a feature matrix (features are sorted alphabetically by name)
- Performs imputation and normalization with respect to the population
After all these steps, it splits the data into 80% train and 20% test.
The binary labels take two values:
-1: survivor
+1: died in hospital
    Returns the features and labels for train and test sets, followed by the names of features.
"""
df_labels = pd.read_csv('data/labels.csv')
    # Keep only the first 2,500 examples
    df_labels = df_labels[:2500]
    IDs = df_labels['RecordID']
raw_data = {}
for i in tqdm(IDs, desc='Loading files from disk'):
raw_data[i] = pd.read_csv('data/files/{}.csv'.format(i))
features = Parallel(n_jobs=16)(delayed(hw3_main.generate_feature_vector)(df) for _, df in tqdm(raw_data.items(), desc='Generating feature vectors'))
df_features = pd.DataFrame(features).sort_index(axis=1)
feature_names = df_features.columns.tolist()
X, y = df_features.values, df_labels['In-hospital_death'].values
X = hw3_main.impute_missing_values(X)
X = hw3_main.normalize_feature_matrix(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=3)
return X_train, y_train, X_test, y_test, feature_names
def get_classifier(penalty='l2', C=1.0, class_weight=None):
"""
Returns a logistic regression classifier based on the given
penalty function, regularization parameter C, and class weights.
"""
if penalty == 'l2':
return LogisticRegression(penalty=penalty, C=C, class_weight=class_weight, solver='lbfgs', max_iter=1000)
elif penalty == 'l1':
return LogisticRegression(penalty=penalty, C=C, class_weight=class_weight, solver='saga', max_iter=5000)
else:
raise ValueError('Error: unsupported configuration')
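# A hedged usage sketch (not part of the assignment's required API): scoring a
# single get_classifier configuration with stratified 5-fold cross-validation
# on AUROC, where X_train and y_train come from get_train_test_split above.
#
# >>> from sklearn.model_selection import cross_val_score, StratifiedKFold
# >>> clf = get_classifier(penalty='l2', C=1.0)
# >>> scores = cross_val_score(clf, X_train, y_train, scoring='roc_auc',
# ...                          cv=StratifiedKFold(n_splits=5))
# >>> scores.mean()  # mean validation AUROC across the five folds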
def get_challenge_data():
"""
This function is similar to helper.get_train_test_split, except that:
- It reads in all 10,000 training examples
- It does not return labels for the 2,000 examples in the heldout test set
You should replace your preprocessing functions (generate_feature_vector,
impute_missing_values, normalize_feature_matrix) with updated versions for the challenge
"""
    df_labels = pd.read_csv('data/labels.csv')
    IDs = df_labels['RecordID']
raw_data = {}
for i in tqdm(IDs, desc='Loading files from disk'):
raw_data[i] = pd.read_csv('data/files/{}.csv'.format(i))
features = Parallel(n_jobs=16)(delayed(hw3_challenge.generate_feature_vector_challenge)(df) for _, df in tqdm(raw_data.items(), desc='Generating feature vectors'))
df_features = pd.DataFrame(features)
feature_names = df_features.columns.tolist()
X, y = df_features.values, df_labels['30-day_mortality'].values
X = hw3_challenge.impute_missing_values_challenge(X)
X = hw3_challenge.normalize_feature_matrix_challenge(X)
return X[:10000], y[:10000], X[10000:], feature_names
def make_challenge_submission(y_label, y_score):
"""
Takes in `y_label` and `y_score`, which are two list-like objects that contain
both the binary predictions and raw scores from your classifier.
Outputs the prediction to challenge.csv.
Please make sure that you do not change the order of the test examples in the heldout set
since we will use this file to evaluate your classifier.
"""
print('Saving challenge output...')
pd.DataFrame({'label': y_label.astype(int), 'risk_score': y_score}).to_csv('challenge.csv', index=False)
print('challenge.csv saved')
return
def test_challenge_output():
    import csv
    valid = True
    with open('challenge.csv', newline='') as csvfile:
        filereader = csv.reader(csvfile)
        i = 0
        for row in filereader:
            if i == 0:
                if row[0] != 'label':
                    print('INVALID COLUMN NAME: column name is not label.')
                    valid = False
            else:
                rating = int(row[0])
                if rating != -1 and rating != 0 and rating != 1:
                    print('INVALID VALUE: values need to be -1, 0, or 1.')
                    valid = False
            i += 1
    # one header row plus 2,000 heldout predictions
    if i != 2001:
        print('INVALID NUMBER OF ROWS: number of rows is not 2001.')
        valid = False
    if valid:
        print('SUCCESS: csv file is valid.')
    return