Using the heart disease dataset from https://archive.ics.uci.edu/dataset/45/heart+disease, our task was to compute proximity measures between patients and to explore the relationships among the dataset’s attributes using plots.

Heart Dataset

스크린샷 2025-02-12 오후 2.07.58.png

Pearson’s Correlation Coefficient

def standard_dev(series_a):
    '''
    Computes the sample standard deviation (n - 1 denominator) of a vector.

    With fewer than two observations the sample standard deviation is
    undefined, so NaN is returned. A NaN entry in the input propagates to a
    NaN result.

    :param series_a: pandas.core.series.Series of numeric values
    :return: numpy.float64
    '''
    n_obs = len(series_a)
    if n_obs < 2:
        # Dividing by (n - 1) is meaningless here; report "undefined" as NaN.
        return np.float64(np.nan)
    # Convert once and let NumPy do the element-wise work; this also avoids
    # shadowing the builtin sum() with a local accumulator.
    deviations = np.asarray(series_a, dtype=float) - series_a.mean()
    return np.sqrt(np.sum(deviations ** 2) / (n_obs - 1))

def covariance(series_x, series_y):
    '''
    Computes the sample covariance (n - 1 denominator) of two vectors of the
    same length/dim. Values are paired by position, not by index label.

    If the lengths differ, an error message is printed and None is returned.
    With fewer than two observations the sample covariance is undefined, so
    NaN is returned.

    :param series_x: pandas.core.series.Series
    :param series_y: pandas.core.series.Series
    :return: numpy.float64, or None when the lengths differ
    '''
    n_obs = len(series_x)
    if n_obs != len(series_y):
        print("Error: Two vectors have different lengths")
        return None
    if n_obs < 2:
        # Dividing by (n - 1) is meaningless here; report "undefined" as NaN.
        return np.float64(np.nan)
    # Plain arrays keep the pairing positional (pandas arithmetic would align
    # on index labels instead) and avoid a Python-level accumulation loop.
    dev_x = np.asarray(series_x, dtype=float) - series_x.mean()
    dev_y = np.asarray(series_y, dtype=float) - series_y.mean()
    return np.sum(dev_x * dev_y) / (n_obs - 1)

def pearson_corr(df):
    '''
    Takes a (N x d) matrix dataframe and computes the pearson correlation
    coefficient for all pairs of instances (rows), returning a (N x N) matrix
    dataframe containing those values.

    Entry (i, j) is cov(row_i, row_j) / (std(row_i) * std(row_j)), using the
    sample (d - 1) denominator for both covariance and standard deviation.
    Pairs involving a row with zero variance or a NaN value come out as NaN.

    :param df: pandas.core.frame.DataFrame (N rows, d columns), numeric
    :return: pandas.core.frame.DataFrame (N rows, N columns), indexed and
             labelled by df.index
    '''
    values = df.to_numpy(dtype=float)
    n_features = values.shape[1]

    # Centre every row on its own mean; a single matrix product then yields
    # all pairwise covariances at once instead of recomputing them per pair.
    centered = values - values.mean(axis=1, keepdims=True)
    cov_matrix = (centered @ centered.T) / (n_features - 1)

    # The diagonal holds each row's variance, so the standard deviations are
    # computed once per row rather than twice per pair.
    row_std = np.sqrt(np.diag(cov_matrix))
    corr_matrix = cov_matrix / np.outer(row_std, row_std)

    return pd.DataFrame(corr_matrix, index=df.index, columns=df.index)

Cosine Similarity

def cosine_similarity(df):
    '''
    Takes a (N x d) matrix dataframe and computes the cosine similarity for
    all pairs of instances (rows), returning a (N x N) matrix dataframe
    containing those values.

    Entry (i, j) is dot(row_i, row_j) / (norm(row_i) * norm(row_j)). Pairs
    involving an all-zero row or a row containing NaN come out as NaN.

    :param df: pandas.core.frame.DataFrame (N rows, d columns), numeric
    :return: pandas.core.frame.DataFrame (N rows, N columns), indexed and
             labelled by df.index
    '''
    values = df.to_numpy(dtype=float)

    # Each row norm is computed once instead of once per pair.
    row_norms = np.linalg.norm(values, axis=1)

    # values @ values.T holds every pairwise dot product; the outer product
    # of the norms holds the matching denominators.
    similarity = (values @ values.T) / np.outer(row_norms, row_norms)

    return pd.DataFrame(similarity, index=df.index, columns=df.index)

Euclidean Distance

def euclidean_distance(df):
    '''
    Takes a (N x d) matrix dataframe and computes the Euclidean distance for
    all pairs of instances (rows), returning a (N x N) matrix dataframe
    containing those values.

    Entry (i, j) is sqrt(sum((row_i - row_j) ** 2)). Components whose
    difference is NaN are skipped, i.e. they contribute nothing to the sum.

    :param df: pandas.core.frame.DataFrame (N rows, d columns), numeric
    :return: pandas.core.frame.DataFrame (N rows, N columns), indexed and
             labelled by df.index
    '''
    values = df.to_numpy(dtype=float)
    n_rows = values.shape[0]
    distances = np.zeros((n_rows, n_rows))

    # One vectorised pass per row: broadcasting subtracts the row from every
    # other row, so only an (N x d) temporary is alive at any time.
    for i, row in enumerate(values):
        squared_diff = (values - row) ** 2
        # nansum ignores NaN components instead of propagating them.
        distances[i] = np.sqrt(np.nansum(squared_diff, axis=1))

    return pd.DataFrame(distances, index=df.index, columns=df.index)

Since the proximity calculation was done row by row, each function output a 100 x 100 matrix.

We couldn’t find much of significance in the matrices, though.