Using the heart disease dataset from https://archive.ics.uci.edu/dataset/45/heart+disease, we were tasked with computing proximity measures between patients and exploring the relationships among the dataset’s attributes using plots.
def standard_dev(series_a):
    '''
    Compute the sample standard deviation (n - 1 denominator) of a vector.

    :param series_a: pandas.core.series.Series (any 1-D numeric sequence is accepted)
    :return: numpy.float64; NaN when fewer than two values are supplied, since
        the sample standard deviation is undefined in that case
    '''
    values = np.asarray(series_a, dtype=float)
    n = values.size
    if n < 2:
        # The (n - 1) denominator would be 0 (or -1 for empty input), so
        # report "undefined" explicitly instead of dividing.
        return np.float64(np.nan)
    deviations = values - values.mean()
    return np.sqrt(np.sum(deviations ** 2) / (n - 1))
def covariance(series_x, series_y):
    '''
    Compute the sample covariance (n - 1 denominator) of two equal-length vectors.

    Values are paired by position, not by pandas index label.

    :param series_x: pandas.core.series.Series (any 1-D numeric sequence is accepted)
    :param series_y: pandas.core.series.Series (any 1-D numeric sequence is accepted)
    :return: numpy.float64; NaN when fewer than two pairs are supplied, since
        the sample covariance is undefined in that case
    :raises ValueError: if the two vectors have different lengths
    '''
    # Converting to plain arrays guarantees positional pairing; arithmetic on
    # two Series would align them by index label instead.
    x = np.asarray(series_x, dtype=float)
    y = np.asarray(series_y, dtype=float)
    if x.size != y.size:
        raise ValueError(
            f"Two vectors have different lengths: {x.size} != {y.size}"
        )
    n = x.size
    if n < 2:
        # The (n - 1) denominator would be 0 (or -1 for empty input).
        return np.float64(np.nan)
    return np.sum((x - x.mean()) * (y - y.mean())) / (n - 1)
def pearson_corr(df):
    '''
    Compute the Pearson correlation coefficient between every pair of rows.

    Takes a (N x d) matrix dataframe and returns a (N x N) matrix dataframe
    whose entry (i, j) is the correlation between row i and row j, i.e.
    cov(row_i, row_j) / (std(row_i) * std(row_j)) using sample (d - 1)
    statistics. Entries involving a row with zero variance are NaN, and every
    entry is NaN when d < 2, because the coefficient is undefined there.

    :param df: pandas.core.frame.DataFrame (N rows, d columns)
    :return: pandas.core.frame.DataFrame (N rows, N columns), indexed and
        labelled by df.index
    '''
    values = df.to_numpy(dtype=float)
    n_rows, n_features = values.shape

    if n_features < 2:
        corr = np.full((n_rows, n_rows), np.nan)
    else:
        # Centre each row once; all pairwise covariances then come from a
        # single matrix product instead of N^2 separate passes over the data.
        centered = values - values.mean(axis=1, keepdims=True)
        # 0/0 for constant rows is expected and yields NaN; silence the warning.
        with np.errstate(divide="ignore", invalid="ignore"):
            cov = centered @ centered.T / (n_features - 1)
            std = np.sqrt(np.diag(cov))
            corr = cov / np.outer(std, std)

    return pd.DataFrame(corr, index=df.index, columns=df.index)
def cosine_similarity(df):
    '''
    Compute the cosine similarity between every pair of rows.

    Takes a (N x d) matrix dataframe and returns a (N x N) matrix dataframe
    whose entry (i, j) is dot(row_i, row_j) / (norm(row_i) * norm(row_j)).
    Entries involving an all-zero row are NaN, because the similarity is
    undefined for a zero-length vector.

    :param df: pandas.core.frame.DataFrame (N rows, d columns)
    :return: pandas.core.frame.DataFrame (N rows, N columns), indexed and
        labelled by df.index
    '''
    values = df.to_numpy(dtype=float)
    # Each row norm is computed once rather than once per pair.
    norms = np.linalg.norm(values, axis=1)
    # 0/0 for all-zero rows is expected and yields NaN; silence the warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = (values @ values.T) / np.outer(norms, norms)
    return pd.DataFrame(similarity, index=df.index, columns=df.index)
def euclidean_distance(df):
    '''
    Compute the Euclidean distance between every pair of rows.

    Takes a (N x d) matrix dataframe and returns a (N x N) matrix dataframe
    whose entry (i, j) is sqrt(sum((row_i - row_j) ** 2)). Components whose
    difference is NaN are left out of the sum.

    :param df: pandas.core.frame.DataFrame (N rows, d columns)
    :return: pandas.core.frame.DataFrame (N rows, N columns), indexed and
        labelled by df.index
    '''
    values = df.to_numpy(dtype=float)
    n_rows = values.shape[0]
    distances = np.zeros((n_rows, n_rows))
    for i, row in enumerate(values):
        # One vectorised pass gives the distances from this row to all rows.
        # nansum ignores NaN differences, so a missing attribute contributes
        # nothing to the distance instead of turning it into NaN.
        distances[i] = np.sqrt(np.nansum((values - row) ** 2, axis=1))
    return pd.DataFrame(distances, index=df.index, columns=df.index)
Since each proximity calculation was performed row by row, each function output a 100 x 100 matrix.
However, we could not find much of significance in the matrices.