# palmer.py (exported from Notion)
#
# Uses the Palmer Penguins dataset:
# https://github.com/allisonhorst/palmerpenguins?tab=readme-ov-file
import pandas

"""
Pandas DataFrame Manipulation with Palmer Penguins Dataset
~~~~~~
Follow the instructions in the homework to complete the assignment.
"""

def load_csv(inputfile):
    """
    Read a csv file into a pandas data frame.

    Parameters
    ----------
    inputfile : string
        Name (path) of the csv file to read. The value is handed
        straight to ``pandas.read_csv``.

    Returns
    -------
    csvdf : pandas.DataFrame
        Data frame holding the parsed contents of ``inputfile``.
    """
    csvdf = pandas.read_csv(inputfile)
    return csvdf
    

def remove_na(inputdf, colname):
    """
    Drop every row whose value in one given column is NA.

    Only ``colname`` is inspected; NA values in other columns do not
    cause a row to be removed. The input data frame is not modified.

    Parameters
    ----------
    inputdf : pandas.DataFrame
        Data frame to filter
    colname : string
        Column whose NA entries decide which rows are dropped

    Returns
    -------
    outputdf : pandas.DataFrame
        New data frame containing only the rows where ``colname``
        is not NA
    """
    # Restrict the NA check to the single requested column.
    columns_to_check = [colname]
    outputdf = inputdf.dropna(subset=columns_to_check)
    return outputdf

def onehot(inputdf, colname):
    """
    Replace one column of the data frame with its one-hot encoding.

    ``pandas.get_dummies`` removes the original column and appends one
    integer indicator column per distinct value at the end of the frame.

    Parameters
    ----------
    inputdf : pandas.DataFrame
        Data frame containing the column to encode
    colname : string
        Name of the column to one-hot encode

    Returns
    -------
    outputdf : pandas.DataFrame
        New data frame without ``colname`` and with the indicator
        columns added after the remaining columns
    """
    # Encode only the requested column; all other columns pass through.
    columns_to_encode = [colname]
    outputdf = pandas.get_dummies(
        inputdf,
        columns=columns_to_encode,
        dtype=int,
    )
    return outputdf

def to_numeric(inputdf):
    """
    Extract the int64 and float64 columns of the data frame as a
    numpy array.

    Columns of any other dtype (for example strings, or integers and
    floats of a different width) are left out of the result.

    Parameters
    ----------
    inputdf : pandas.DataFrame
        Input dataframe

    Returns
    -------
    outputnp : numpy.ndarray
        Two-dimensional array with one column per selected data frame
        column, in the original column order
    """
    wanted_dtypes = ['int64', 'float64']
    numeric_part = inputdf.select_dtypes(include=wanted_dtypes)
    outputnp = numeric_part.to_numpy()
    return outputnp

def main():
    """
    Run the whole pipeline on the penguins csv: load it, drop rows with
    no species, one-hot encode the species column, and convert the
    result to a numpy array.
    """
    penguins = load_csv("data/penguins.csv")

    # Rows with a missing species are removed before encoding.
    with_species = remove_na(penguins, "species")
    encoded = onehot(with_species, "species")

    # The numeric array is computed but not used any further here.
    to_numeric(encoded)

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()