import numpy
import pandas
import timeit
import matplotlib.pyplot as plt

"""
Vectorization Comparison for Computing Sum of Squares
~~~~~~
Follow the instructions in the homework to complete the assignment.
"""

def gen_random_samples(n):
    """
    Draw n random samples with numpy.random.randn
    (samples from the standard normal distribution).

    Parameters
    ----------
    n : int
        Number of samples to draw.

    Returns
    -------
    sample : 1d array of size n
        The array produced by ``numpy.random.randn(n)``.
    """
    sample = numpy.random.randn(n)
    return sample

def sum_squares_for(samples):
    """
    Accumulate the sum of squares with an explicit Python for loop.

    This is the deliberately non-vectorized implementation used as the
    baseline in the timing comparison.

    Parameters
    ----------
    samples : 1d-array with shape n
        An array (or any iterable) of numbers.

    Returns
    -------
    ss : number
        The sum of squares of the samples. The accumulator starts at the
        integer 0, so an empty input returns 0.
    """
    total = 0
    for value in samples:
        squared = value ** 2
        total += squared
    return total

def sum_squares_np(samples):
    """
    Compute the sum of squares as the dot product of the samples with
    themselves, using the numpy.dot function.

    Parameters
    ----------
    samples : 1d-array with shape n
        An array of numbers.

    Returns
    -------
    ss : float
        The sum of squares of the samples
    """
    ss = numpy.dot(samples, samples)
    return ss

# some_list = gen_random_samples(5)
# print(some_list)
# print(sum_squares_for(some_list))
# print(sum_squares_np(some_list))

def time_ss(sample_list):
    """
    Time both sum-of-squares implementations over several sample sizes.

    For every size s in sample_list, one random sample of length s is
    generated, and the for-loop version and the numpy version are each
    timed once on that same sample.

    Parameters
    ----------
    sample_list : list of length n
        A list of integers, each one a number of samples to time.

    Returns
    -------
    ss_dict : Python dictionary with 3 keys: n, ssfor, ssnp.
        The value for each key is a list that follows the order of
        sample_list: n holds the sample sizes, ssfor and ssnp hold the
        elapsed time in seconds of the for-loop and numpy versions
        for that number of samples.
    """
    clock = timeit.default_timer
    sizes, loop_times, numpy_times = [], [], []

    for size in sample_list:
        sizes.append(size)
        # One draw per size, shared by both implementations.
        data = gen_random_samples(size)

        began = clock()
        sum_squares_for(data)
        loop_times.append(clock() - began)

        began = clock()
        sum_squares_np(data)
        numpy_times.append(clock() - began)

    return {"n": sizes, "ssfor": loop_times, "ssnp": numpy_times}

def timess_to_df(ss_dict):
    """
    Convert a timing dictionary (as returned by time_ss) into a
    Pandas dataframe.

    Parameters
    ----------
    ss_dict : Python dictionary with 3 keys: n, ssfor, ssnp.
        The value for each key should be a list, where the
        ordering of the list follows the sample_list order
        and the timing in seconds associated with that
        number of samples.

    Returns
    -------
    time_df : Pandas dataframe that has n rows and 3 columns.
        The column names are n, ssfor, ssnp, always in that order,
        regardless of the key order of ss_dict.
        ssfor and ssnp contain the time in seconds.
    """
    # Without an explicit column list the dataframe inherits the key
    # order of ss_dict, which would break the documented column order
    # for a dictionary built in any other order.
    return pandas.DataFrame(ss_dict, columns=["n", "ssfor", "ssnp"])

# (f) Compare the for-loop and numpy implementations for n ranging from
# 10 to 10,000,000 (powers of ten).
# NOTE: these statements sit at module level, so the benchmark and the plot
# run every time this file is executed or imported.

sample_list = [10 ** exponent for exponent in range(1, 8)]

df = timess_to_df(time_ss(sample_list))

# Timings are drawn on log-log axes; the pyplot calls are issued in the
# same order as before (scatter, x-scale, scatter, y-scale, labels).
plt.scatter(
    df['n'], df['ssfor'],
    color='green', label='ssfor', marker='s',
)
plt.xscale('log')
plt.scatter(
    df['n'], df['ssnp'],
    color='red', label='ssnp', marker='o',
)
plt.yscale('log')
plt.xlabel('n')
plt.ylabel('seconds')
plt.legend()
plt.show()

# The timing table is printed after the plot call returns.
print(df)

def main():
    """
    Check that the for-loop and numpy implementations agree.

    Draws 100 random samples, computes the sum of squares with both
    implementations and asserts the results match to 5 decimals.

    Raises
    ------
    AssertionError
        If the two results are not almost equal.
    """
    from numpy.testing import assert_almost_equal

    # Both versions are evaluated on the same 100 samples.
    samples = gen_random_samples(100)
    loop_result = sum_squares_for(samples)
    numpy_result = sum_squares_np(samples)

    # The two results must be approximately the same value.
    assert_almost_equal(loop_result, numpy_result, decimal=5)

# Run the consistency check in main() only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

# Plot from (f):
#
# vectorization_comparison.png