Introduction

This assignment focuses on building machine learning models to predict in-hospital and 30-day mortality for ICU patients using real-world clinical data. We worked with a dataset from the Beth Israel Deaconess Medical Center, which contains timestamped medical observations and mortality labels. The core task was to extract meaningful features from these time-series records and use them to train classification models that can identify high-risk patients.

The assignment is divided into three major components:

First, we implement a feature extraction pipeline, transforming each patient's time-series and static data into a structured feature vector. This includes handling missing values, normalizing feature scales, and converting raw inputs into model-ready formats.

Next, we explored model training and hyperparameter selection using logistic regression with both L1 and L2 regularization, evaluating model performance through k-fold cross-validation on several metrics like AUROC, F1-score, and accuracy.

Finally, in the challenge section, we applied the full pipeline to a larger dataset of 10,000 patients to predict 30-day mortality on a held-out set and submitted predictions for evaluation.


Python code

hw3_main.py

# part of hw3_main.py
# includes code for preprocessing step

import warnings
# The original referenced `exceptions` without importing it, raising a
# NameError at import time; UndefinedMetricWarning lives in sklearn.exceptions.
from sklearn import exceptions
warnings.filterwarnings('ignore', category=exceptions.UndefinedMetricWarning)

from helper import *

def generate_feature_vector(df):
    """
    Reads a dataframe containing all measurements for a single patient
    within the first 48 hours of the ICU admission, and converts it into
    a feature vector.

    Input:
        df: pd.DataFrame, with columns [Time, Variable, Value]

    Returns:
        a python dictionary of format {feature_name: feature_value}
        for example, {'Age': 32, 'Gender': 0, 'mean_HR': 84, ...}
    """
    static_variables = config['static']
    timeseries_variables = config['timeseries']
    feature_dict = {}
    feature_count = {}

    # Accumulate running sums (under key 'mean_<var>') and observation counts
    # for each time-series variable; record static variables directly.
    for var, val in zip(df['Variable'], df['Value']):
        if var in static_variables:
            # -1.0 encodes a missing static measurement in this dataset
            feature_dict[var] = np.nan if val == -1.0 else val
        elif var in timeseries_variables:
            feature_count[var] = feature_count.get(var, 0) + 1
            key = 'mean_' + var
            feature_dict[key] = feature_dict.get(key, 0.0) + val

    # Convert the accumulated sums into means.
    for var, count in feature_count.items():
        feature_dict['mean_' + var] /= count

    # Time-series variables never observed for this patient become NaN so the
    # population-level imputation step can fill them in later.
    for var in timeseries_variables:
        feature_dict.setdefault('mean_' + var, np.nan)

    # Additional spread/extreme statistics for a few vitals.
    for var in ['HR', 'MAP', 'Glucose']:
        values = df[df['Variable'] == var]['Value']
        if not values.empty:
            feature_dict['std_' + var] = np.std(values)
            feature_dict['min_' + var] = np.min(values)
        else:
            feature_dict['std_' + var] = np.nan
            feature_dict['min_' + var] = np.nan

    # One-hot encode nominal variables. NOTE(review): the encoder is fit per
    # patient, so only categories observed for this patient produce columns;
    # the later DataFrame aggregation unions columns across patients.
    nominal_vars = ['ICUType', 'Gender']
    for var in nominal_vars:
        values = df[df['Variable'] == var]['Value'].values.reshape(-1, 1)
        if values.size == 0:
            continue  # fitting the encoder on an empty array would raise
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(values)
        for i, category in enumerate(encoder.categories_[0]):
            feature_dict[f'{var}_{category}'] = encoded[0][i]

    # Interaction feature between age and heart rate. Computed once, outside
    # the encoding loop (the original recomputed it on every iteration), with
    # the 0.01 offset inside the denominator so it actually prevents a
    # division by zero; NaN inputs simply propagate to NaN.
    mean_HR = feature_dict['mean_HR']
    feature_dict['age_hr_info'] = feature_dict.get('Age', np.nan) * (1.0 / (mean_HR + 0.01))

    return feature_dict

def impute_missing_values(X):
    """
    For each feature column, impute missing values (np.nan) with the
    population mean for that feature. If more than half of the values
    in a column are missing, remove that column entirely.

    Input:
        X: np.array, shape (N, d). X could contain missing values
    Returns:
        X: np.array, shape (N, d'). X does not contain any missing values,
           and columns with more than half missing values are removed.
    """
    # Count missing entries per column.
    missing_counts = np.sum(np.isnan(X), axis=0)

    # Keep columns with at most half of their values missing. The original
    # used a strict '<', which wrongly dropped columns with exactly half
    # missing, contradicting the docstring ("more than half ... removed").
    valid_columns = missing_counts <= (X.shape[0] / 2)
    X = X[:, valid_columns]

    # Replace each remaining NaN with its column's mean over observed values.
    col_means = np.nanmean(X, axis=0)
    nan_idx = np.isnan(X)
    X[nan_idx] = np.take(col_means, np.where(nan_idx)[1])

    return X

def normalize_feature_matrix(X):
    """
    Min-max scale every feature column of X into the range [0, 1].

    Constant columns (max == min) are mapped to all zeros to avoid a
    zero-division.

    Input:
        X: np.array, shape (N, d).
    Returns:
        X: np.array, shape (N, d). Values are normalized per column.
    """
    col_min = np.nanmin(X, axis=0)
    col_max = np.nanmax(X, axis=0)
    span = col_max - col_min

    constant = span == 0
    # Divide constant columns by a dummy 1; they are zeroed out just below.
    safe_span = np.where(constant, 1.0, span)

    X -= col_min
    X /= safe_span
    X[:, constant] = 0.0
    return X

...

hw3_challenge.py

# hw3_challenge.py

import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

from sklearn.linear_model import LogisticRegression

# Personally imported functions
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

import hw3_main
from helper import *

def generate_feature_vector_challenge(df):
    # The challenge reuses the Part 1 per-patient feature extraction as-is;
    # see hw3_main.generate_feature_vector for the feature definitions.
    return hw3_main.generate_feature_vector(df)

def impute_missing_values_challenge(X):
    # Delegates to the Part 1 population-mean imputation unchanged.
    return hw3_main.impute_missing_values(X)

def normalize_feature_matrix_challenge(X):
    # Delegates to the Part 1 per-column min-max normalization unchanged.
    return hw3_main.normalize_feature_matrix(X)

def print_GridSearchCV_result(X_train, y_train, X_test, y_test, penalty, scoring):
    """
    Grid-search the regularization strength C for a class-weighted logistic
    regression, print the best C and test-set metrics, and return the refit
    best estimator.

    Input:
        X_train, y_train: training features / labels (labels in {-1, +1})
        X_test, y_test:   held-out features / labels used only for reporting
        penalty: 'l1' or 'l2' regularization (saga supports both)
        scoring: sklearn scoring string used to select C (e.g. 'f1', 'roc_auc')
    Returns:
        the best fitted LogisticRegression found by the grid search
    """
    clf = GridSearchCV(
        # Positive (died) class is up-weighted 4x to counter class imbalance.
        LogisticRegression(penalty=penalty, class_weight={-1:1, 1:4}, solver='saga', max_iter=5000),
        {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]},
        scoring=scoring,
        cv=StratifiedKFold(n_splits=5)
    )

    # Train the classifier
    clf.fit(X_train, y_train)

    # Print what penalty and scoring was used
    print("=======================================")
    print("For C in range of 0.001 to 1000 with x10 increments")
    print("Best C for Penalty of",penalty, "and scoring of", scoring,"is C =",clf.best_params_['C'])

    # Get the best model (refit on the full training split)
    best_clf = clf.best_estimator_

    # Get predictions on validation set
    y_pred = best_clf.predict(X_test)
    y_prob = best_clf.predict_proba(X_test)[:, 1]  # Probability for positive class

    # Compute confusion matrix; rows/cols are ordered by sorted labels [-1, 1],
    # so index 1 is the positive (died) class.
    cm = metrics.confusion_matrix(y_test, y_pred)

    # Print confusion matrix. The original used "\\n", which printed a literal
    # backslash-n instead of a blank line.
    print("\nConfusion Matrix:")
    print(f"TP: {cm[1, 1]}  FP: {cm[0, 1]}")
    print(f"FN: {cm[1, 0]}  TN: {cm[0, 0]}")

    # Compute evaluation metrics
    acc = (cm[1, 1] + cm[0, 0]) / cm.sum()
    auroc = metrics.roc_auc_score(y_test, y_prob)
    f1 = metrics.f1_score(y_test, y_pred)

    # Print results
    print("\nEvaluation Metrics:")
    print(f"Accuracy: {acc:.4f}")
    print(f"AUROC: {auroc:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return best_clf

def run_challenge(X_challenge, y_challenge, X_heldout):
    """
    Train linear classifiers on the challenge data and write heldout-set
    predictions to challenge.csv via make_challenge_submission.

    Input:
        X_challenge: np.array (N, d), preprocessed challenge features
        y_challenge: np.array (N,), 30-day mortality labels in {-1, +1}
        X_heldout:   np.array (M, d), heldout features to predict
    """
    print("================= Part 3 ===================")
    print("Part 3: Challenge")

    # Hold out 20% so each grid search can report validation metrics.
    X_train, X_test, y_train, y_test = train_test_split(X_challenge, y_challenge, test_size=0.2)

    # Grid-search under two model-selection criteria; both print their reports.
    clf1 = print_GridSearchCV_result(X_train, y_train, X_test, y_test, 'l2', 'f1')
    clf2 = print_GridSearchCV_result(X_train, y_train, X_test, y_test, 'l2', 'roc_auc')

    # Submit with the AUROC-selected model. The original left this step as
    # commented-out TODO code, so X_heldout was never used.
    y_score = clf2.predict_proba(X_heldout)[:, 1]
    y_label = clf2.predict(X_heldout)
    make_challenge_submission(y_label, y_score)

if __name__ == '__main__':
    # Fix the global NumPy seed so the train/test split and the saga solver
    # behave reproducibly across runs.
    np.random.seed(42)
    # Read challenge data
    X_challenge, y_challenge, X_heldout, feature_names = get_challenge_data()
    run_challenge(X_challenge, y_challenge, X_heldout)
    # test_challenge_output()

helper.py

# HW3 - helper.py

import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

import yaml
# Load the shared variable configuration once at import time. Use a context
# manager so the file handle is closed promptly (the original left the object
# returned by open() unclosed).
with open('config.yaml') as _config_file:
    config = yaml.load(_config_file, Loader=yaml.FullLoader)

import hw3_main
import hw3_challenge

def get_train_test_split():
    """
    This function performs the following steps:
    - Reads in the data from data/labels.csv and data/files/*.csv (keep only the first 2,500 examples)
    - Generates a feature vector for each example
    - Aggregates feature vectors into a feature matrix (features are sorted alphabetically by name)
    - Performs imputation and normalization with respect to the population

    After all these steps, it splits the data into 80% train and 20% test.

    The binary labels take two values:
        -1: survivor
        +1: died in hospital

    Returns the features and labels for train and test sets, followed by the names of features.
    """
    df_labels = pd.read_csv('data/labels.csv')
    df_labels = df_labels[:2500]
    IDs = df_labels['RecordID'][:2500]
    raw_data = {}
    for i in tqdm(IDs, desc='Loading files from disk'):
        raw_data[i] = pd.read_csv('data/files/{}.csv'.format(i))

    # Extract per-patient features in parallel; dict iteration preserves the
    # insertion order of IDs, so feature-matrix rows line up with df_labels.
    features = Parallel(n_jobs=16)(delayed(hw3_main.generate_feature_vector)(df) for _, df in tqdm(raw_data.items(), desc='Generating feature vectors'))
    # Union of the per-patient feature dicts, columns sorted alphabetically.
    df_features = pd.DataFrame(features).sort_index(axis=1)
    feature_names = df_features.columns.tolist()
    X, y = df_features.values, df_labels['In-hospital_death'].values
    X = hw3_main.impute_missing_values(X)
    X = hw3_main.normalize_feature_matrix(X)
    # Stratified 80/20 split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=3)
    return X_train, y_train, X_test, y_test, feature_names

def get_classifier(penalty='l2', C=1.0, class_weight=None):
    """
    Returns a logistic regression classifier based on the given
    penalty function, regularization parameter C, and class weights.

    Raises ValueError for any penalty other than 'l1' or 'l2'.
    """
    # Each supported penalty maps to a compatible solver and iteration budget.
    solver_settings = {
        'l2': ('lbfgs', 1000),
        'l1': ('saga', 5000),
    }
    if penalty not in solver_settings:
        raise ValueError('Error: unsupported configuration')
    solver, max_iter = solver_settings[penalty]
    return LogisticRegression(penalty=penalty, C=C, class_weight=class_weight, solver=solver, max_iter=max_iter)

def get_challenge_data():
    """
    This function is similar to helper.get_train_test_split, except that:
    - It reads in all 10,000 training examples
    - It does not return labels for the 2,000 examples in the heldout test set
    You should replace your preprocessing functions (generate_feature_vector,
    impute_missing_values, normalize_feature_matrix) with updated versions for the challenge
    """
    df_labels = pd.read_csv('data/labels.csv')
    IDs = df_labels['RecordID']
    raw_data = {}
    for i in tqdm(IDs, desc='Loading files from disk'):
        raw_data[i] = pd.read_csv('data/files/{}.csv'.format(i))

    features = Parallel(n_jobs=16)(delayed(hw3_challenge.generate_feature_vector_challenge)(df) for _, df in tqdm(raw_data.items(), desc='Generating feature vectors'))
    # Sort feature columns alphabetically so the matrix layout is consistent
    # with get_train_test_split (the original skipped the sort, leaving column
    # order dependent on per-patient insertion order).
    df_features = pd.DataFrame(features).sort_index(axis=1)
    feature_names = df_features.columns.tolist()
    X, y = df_features.values, df_labels['30-day_mortality'].values
    # Impute/normalize over train + heldout together, then split: the first
    # 10,000 rows are the labeled training data, the remainder is heldout.
    X = hw3_challenge.impute_missing_values_challenge(X)
    X = hw3_challenge.normalize_feature_matrix_challenge(X)
    return X[:10000], y[:10000], X[10000:], feature_names

def make_challenge_submission(y_label, y_score):
    """
    Takes in `y_label` and `y_score`, which are two list-like objects that contain
    both the binary predictions and raw scores from your classifier.
    Outputs the prediction to challenge.csv.

    Please make sure that you do not change the order of the test examples in the heldout set
    since we will use this file to evaluate your classifier.
    """
    print('Saving challenge output...')
    # Labels are written as ints; row order must match the heldout set.
    submission = pd.DataFrame({
        'label': y_label.astype(int),
        'risk_score': y_score,
    })
    submission.to_csv('challenge.csv', index=False)
    print('challenge.csv saved')
    return

def test_challenge_output():
    """
    Sanity-check challenge.csv: the header's first column must be 'label',
    every label must be -1, 0, or 1, and there must be exactly 2001 rows
    (header + 2000 predictions). Prints a diagnostic for each problem found,
    and prints SUCCESS only when every check passes (the original printed
    SUCCESS unconditionally, even after reporting errors).
    """
    import csv
    valid = True
    with open('challenge.csv', newline='') as csvfile:
        filereader = csv.reader(csvfile)
        i = 0
        for row in filereader:
            if i == 0:
                if row[0] != 'label':
                    print('INVALID COLUMN NAME: column name is not label.')
                    valid = False
            else:
                rating = int(row[0])
                if rating != -1 and rating != 0 and rating != 1:
                    print('INVALID VALUE: values need to be -1, 0, or 1.')
                    valid = False
            i += 1
        if i != 2001:
            print('INVALID NUMBER OF ROWS: number of rows is not 2001.')
            valid = False
        if valid:
            print('SUCCESS: csv file is valid.')
    return