import numpy
import pandas
import timeit
import matplotlib.pyplot as plt
"""
Vectorization Comparison for Computing Sum of Squares
~~~~~~
Follow the instructions in the homework to complete the assignment.
"""
def gen_random_samples(n):
    """
    Draw n samples with numpy.random.randn.

    Parameters
    ----------
    n : int
        Number of samples to draw.

    Returns
    -------
    sample : 1d array of size n
        The n values returned by numpy.random.randn(n).
    """
    sample = numpy.random.randn(n)
    return sample
def sum_squares_for(samples):
    """
    Compute the sum of squares using an explicit for loop.

    This is deliberately the non-vectorized implementation; it is the
    baseline that the NumPy version is timed against.

    Parameters
    ----------
    samples : 1d-array with shape n
        An array (or any iterable) of numbers.

    Returns
    -------
    ss : float
        The sum of squares of the samples. An empty input yields 0.
    """
    # Named ``total`` rather than ``sum`` so the builtin sum() is not shadowed.
    total = 0
    for element in samples:
        total += element**2
    return total
def sum_squares_np(samples):
    """
    Compute the sum of squares using NumPy's dot function.

    Parameters
    ----------
    samples : 1d-array with shape n
        An array of numbers.

    Returns
    -------
    ss : float
        The sum of squares of the samples.
    """
    # The dot product of a vector with itself is the sum of its squares.
    ss = numpy.dot(samples, samples)
    return ss
# some_list = gen_random_samples(5)
# print(some_list)
# print(sum_squares_for(some_list))
# print(sum_squares_np(some_list))
def time_ss(sample_list):
    """
    Time both sum-of-squares implementations for several sample sizes.

    For each size s in sample_list, one random sample of length s is
    generated and the same sample is passed first to the for-loop version
    and then to the NumPy version. Each call is timed once with
    timeit.default_timer.

    Parameters
    ----------
    sample_list : list of length n
        A list of integers, each the number of samples to generate and time.

    Returns
    -------
    ss_dict : Python dictionary with 3 keys: n, ssfor, ssnp.
        The value for each key is a list whose ordering follows
        sample_list: n holds the sample sizes, ssfor and ssnp hold the
        elapsed time in seconds for that number of samples.
    """
    sizes, loop_times, numpy_times = [], [], []
    clock = timeit.default_timer

    for size in sample_list:
        data = gen_random_samples(size)

        # Time the explicit for-loop implementation.
        began = clock()
        sum_squares_for(data)
        loop_times.append(clock() - began)

        # Time the NumPy implementation on the very same data.
        began = clock()
        sum_squares_np(data)
        numpy_times.append(clock() - began)

        sizes.append(size)

    return {"n": sizes, "ssfor": loop_times, "ssnp": numpy_times}
def timess_to_df(ss_dict):
    """
    Convert the timing dictionary produced by time_ss into a dataframe.

    Parameters
    ----------
    ss_dict : Python dictionary with 3 keys: n, ssfor, ssnp.
        The value for each key should be a list, where the
        ordering of the list follows the sample_list order
        and the timing in seconds associated with that
        number of samples.

    Returns
    -------
    time_df : Pandas dataframe that has n rows and 3 columns.
        The columns are n, ssfor, ssnp, in that order, regardless of the
        key order of ss_dict. ssfor and ssnp contain the time in seconds.
    """
    # Passing ``columns`` pins the documented column order instead of
    # relying on the insertion order of the incoming dictionary.
    return pandas.DataFrame(ss_dict, columns=["n", "ssfor", "ssnp"])
# (f) Compare the two implementations for n ranging from 10 to 10,000,000
# (powers of ten), then plot the timings on log-log axes.
sample_list = [10 ** exponent for exponent in range(1, 8)]
df = timess_to_df(time_ss(sample_list))

# For-loop timings: green squares.
plt.scatter(x=df['n'], y=df['ssfor'], color='green', label='ssfor', marker='s')
plt.xscale('log')
# NumPy timings: red circles.
plt.scatter(x=df['n'], y=df['ssnp'], color='red', label='ssnp', marker='o')
plt.yscale('log')

plt.xlabel('n')
plt.ylabel('seconds')
plt.legend()
plt.show()

print(df)
def main():
    """Check that the for-loop and NumPy versions agree on 100 samples."""
    from numpy.testing import assert_almost_equal

    # One shared random sample of 100 values for both implementations.
    data = gen_random_samples(100)
    loop_result = sum_squares_for(data)
    dot_result = sum_squares_np(data)

    # The two results must match to 5 decimal places.
    assert_almost_equal(loop_result, dot_result, decimal=5)


if __name__ == "__main__":
    main()
# Plot from (f):