
EDA Sample

Example: End-to-End Exploratory Data Analysis (EDA)

In this example, we will perform a complete EDA process, covering the following topics:

  • Column Separation: separate the columns in the dataset according to their types.
  • Column Exploration: explore the separated columns to understand their characteristics.
  • Outlier Detection and Handling: identify outliers using thresholds and replace them with suitable values.
  • Correlation Analysis: analyze the correlation between numerical features to uncover relationships.
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 30)
pd.set_option('display.width', 150)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
warnings.simplefilter(action="ignore")

Define the connection for the dataset

data_set_conn = {
    "connection_type": "WORKER_FILE",
    "file_path": "/home/ubuntu/samples/insurance.csv"
}

Create a worker and start the process

import practicuscore as prt

worker = prt.get_local_worker()

proc = worker.load(data_set_conn, engine='AUTO') 

df = proc.get_df_copy()
display(df)
df.info()
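
If you are not running inside a Practicus AI worker, the same CSV can also be loaded directly with pandas. This is only a minimal alternative sketch; it assumes the file path above is readable from your environment.

# Optional alternative: read the dataset directly with pandas
df_local = pd.read_csv("/home/ubuntu/samples/insurance.csv")
df_local.info()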

Separate the columns according to their types

Objective

The grab_col_names function aims to identify and classify columns in a given DataFrame into different types based on their data characteristics.

Functionality

  • Date Columns (date_cols): columns with a datetime64[ns] data type.
  • Categorical Columns (cat_cols): columns of type object or category, plus numeric columns whose number of unique values is below the cat_th threshold (default 10); high-cardinality columns in cat_but_car are excluded.
  • Numerical Columns (num_cols): float or integer columns that are not classified as num_but_cat.
  • Categorical but Cardinal Columns (cat_but_car): categorical columns whose number of unique values exceeds the car_th threshold (default 25).
  • Numerical but Categorical Columns (num_but_cat): numeric columns with fewer unique values than the cat_th threshold, treated as categorical.
def grab_col_names(dataframe, cat_th=10, car_th=25, show_date=False):
    # Datetime columns
    date_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "datetime64[ns]"]
    # Object/category columns
    cat_cols = dataframe.select_dtypes(["object", "category"]).columns.tolist()
    # Numeric columns that behave like categoricals (few unique values)
    num_but_cat = [col for col in dataframe.select_dtypes(["float", "integer"]).columns if dataframe[col].nunique() < cat_th]
    # High-cardinality categorical columns
    cat_but_car = [col for col in dataframe.select_dtypes(["object", "category"]).columns if dataframe[col].nunique() > car_th]
    # Treat low-cardinality numeric columns as categorical and drop high-cardinality ones
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    # Remaining numeric columns
    num_cols = dataframe.select_dtypes(["float", "integer"]).columns
    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'date_cols: {len(date_cols)}')
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')


    if show_date:
        return date_cols, cat_cols, cat_but_car, num_cols, num_but_cat
    else:
        return cat_cols, cat_but_car, num_cols, num_but_cat
cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df)
print(cat_cols)
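
The optional show_date parameter also returns any datetime columns. A brief sketch of that call is shown below; this dataset is not expected to contain datetime columns, so date_cols will likely be empty.

# Optional: also return datetime columns by passing show_date=True
date_cols, cat_cols, cat_but_car, num_cols, num_but_cat = grab_col_names(df, show_date=True)
print(date_cols)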

The cat_analyzer function summarizes a categorical column.

If No Target Variable (target=None)

For the given categorical column, it displays a table with:
  • COUNT: the number of times each category occurs.
  • RATIO: the percentage of the total data.

If a Target Variable Exists (target!=None)

In addition to the above, it shows the relationship between the target variable and the categorical variable:
  • TARGET_COUNT: the number of non-null target observations in each category.
  • TARGET_MEAN: the average of the target variable for each category.
  • TARGET_MEDIAN: the median of the target variable for each category.
  • TARGET_STD: the standard deviation of the target variable for each category.

def cat_analyzer(dataframe, variable, target=None):
    print(variable)
    if target is None:
        print(pd.DataFrame({
            "COUNT": dataframe[variable].value_counts(),
            "RATIO": dataframe[variable].value_counts() / len(dataframe)}), end="\n\n\n")
    else:
        # Exclude rows with a missing target when computing the target statistics
        temp = dataframe[dataframe[target].notnull()]
        print(pd.DataFrame({
            "COUNT": dataframe[variable].value_counts(),
            "RATIO": dataframe[variable].value_counts() / len(dataframe),
            "TARGET_COUNT": dataframe.groupby(variable)[target].count(),
            "TARGET_MEAN": temp.groupby(variable)[target].mean(),
            "TARGET_MEDIAN": temp.groupby(variable)[target].median(),
            "TARGET_STD": temp.groupby(variable)[target].std()}), end="\n\n\n")
cat_analyzer(df, 'region') 
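
The call above runs without a target. To illustrate the target!=None branch, the same function can be pointed at a numeric column; the sketch below assumes the insurance dataset contains the charges and smoker columns.

# Hypothetical usage with a numeric target column (assumes 'charges' and 'smoker' exist)
cat_analyzer(df, 'region', target='charges')
cat_analyzer(df, 'smoker', target='charges')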
df[num_cols].hist(figsize = (25,20), bins=15);
df[num_cols].describe([0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.95, 0.99]).T.drop(['count'], axis=1)

Purpose of the Outlier Helper Functions

1. outliers_threshold

  • Purpose: Calculates the threshold values used to detect outliers in a numerical column.
  • How it Works:
  • Applies the interquartile-range (IQR) method, but based on the 5th and 95th percentiles rather than the classic 25th and 75th.
  • Defines outliers as values below q05 - 1.5 * range or above q95 + 1.5 * range, where range = q95 - q05 (see the toy sketch after this list).

2. grab_outlier

  • Purpose: Identifies outliers in a specified column and optionally returns their indices.
  • How it Works:
  • Detects outliers using the thresholds calculated by outliers_threshold.
  • Prints the outlier rows (only the first few when there are 10 or more).
  • If the index parameter is set to True, it returns the indices of the outliers.

3. replace_with_thresholds

  • Purpose: Replaces outliers in a numerical column with the calculated threshold values.
  • How it Works:
  • Detects outliers using outliers_threshold.
  • If the lower threshold is positive, values below it are replaced with the lower threshold value.
  • Values above the upper threshold are always replaced with the upper threshold value.
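
To make the threshold arithmetic concrete, here is a minimal, self-contained toy sketch of the 5th/95th-percentile IQR calculation (the values are made up purely for illustration):

# Toy illustration of the percentile-based threshold arithmetic (made-up values)
toy = pd.Series(list(range(1, 100)) + [10_000])
q05, q95 = toy.quantile(0.05), toy.quantile(0.95)
value_range = q95 - q05
low, up = q05 - 1.5 * value_range, q95 + 1.5 * value_range
print(f"low={low:.2f}, up={up:.2f}")
# The extreme value 10_000 falls above 'up' and would be flagged as an outlier
print(toy[(toy < low) | (toy > up)].tolist())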
# This function calculates the threshold values to be used to detect outliers of a column.
def outliers_threshold(dataframe, column):
    q1 = dataframe[column].quantile(0.05)
    q3 = dataframe[column].quantile(0.95)
    inter_quartile_range = q3 - q1
    low = q1 - 1.5 * inter_quartile_range
    up = q3 + 1.5 * inter_quartile_range
    return low, up

# This function detects outliers in a column and optionally returns their index.
def grab_outlier(dataframe, column, index=False):
    low, up = outliers_threshold(dataframe, column)
    outliers = dataframe[(dataframe[column] < low) | (dataframe[column] > up)]
    # Print all outlier rows if there are only a few, otherwise just the first rows
    if outliers.shape[0] < 10:
        print(outliers[[column]])
    else:
        print(outliers[[column]].head())
    if index:
        return outliers.index.tolist()
# This function replaces outliers with threshold values.
def replace_with_thresholds(dataframe, col_name):
    low_limit, up_limit = outliers_threshold(dataframe, col_name)
    if low_limit > 0:
        # Cap both sides when the lower threshold is positive
        dataframe.loc[(dataframe[col_name] < low_limit), col_name] = low_limit
        dataframe.loc[(dataframe[col_name] > up_limit), col_name] = up_limit
    else:
        # Otherwise only cap the upper side (a negative lower bound is not meaningful here)
        dataframe.loc[(dataframe[col_name] > up_limit), col_name] = up_limit
# Boxplot of 'age' for values up to 64
df[df['age'] <= 64]['age'].plot(kind='box')

Detect and handle outliers in the numerical columns

for col in num_cols:
    print(f"{'*' * 40} {col.upper()} {'*' * 40}")
    grab_outlier(df, col, True)
    replace_with_thresholds(df, col)
    print('*' * 90, end='\n\n\n')
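
As a quick sanity check, the thresholds can be recomputed after the replacement; for this dataset's non-negative numeric columns no value is expected to remain outside them (a minimal sketch reusing the helpers defined above).

# Verify that (almost) nothing remains outside the recomputed thresholds
for col in num_cols:
    low, up = outliers_threshold(df, col)
    remaining = int(((df[col] < low) | (df[col] > up)).sum())
    print(f"{col}: {remaining} value(s) outside [{low:.2f}, {up:.2f}]")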

This code snippet visualizes the correlations between the numerical columns in the dataset using a heatmap. Correlation measures the strength and direction of the (by default Pearson, i.e. linear) relationship between two variables, and the heatmap shows how strongly each column is related to the others.

plt.figure(figsize=(30,20))
corr_matrix = df.select_dtypes(include=['int64', 'int32', 'float64']).corr()
sns.heatmap(corr_matrix, annot=True, cmap='Reds')
plt.title('Correlation Heatmap')
plt.show()
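
As an optional follow-up, the strongest pairwise correlations can be extracted from the matrix directly. This is a small sketch that reuses corr_matrix from above; the 0.5 cutoff is an arbitrary value chosen for illustration.

# List feature pairs whose absolute correlation exceeds an arbitrary cutoff
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
corr_pairs = upper.stack().sort_values(key=abs, ascending=False)
print(corr_pairs[corr_pairs.abs() > 0.5])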

Summarizing a Categorical Column

1. Frequency and Ratio Table

For each category in the column, the cat_summary function displays a table with:
  • Count: the number of times each category appears.
  • Ratio: the proportion of the category in the total data, as a percentage.

2. Plotting (Optional)

If the argument plot=True is provided:
  • The category frequencies are visualized as a bar chart.
  • Each bar is labeled with the percentage of the category in the total dataset.

def cat_summary(dataframe, x_col, plot=False, rotation=45):
    # Frequency and ratio table
    display(pd.DataFrame({x_col: dataframe[x_col].value_counts(),
                          "Ratio": 100 * dataframe[x_col].value_counts() / len(dataframe)}))

    if plot:
        # Total number of observations, used for the percentage labels
        count = dataframe.groupby(x_col).size().sum()
        # Category counts sorted from most to least frequent
        dataframe_grouped = dataframe.groupby(x_col).size().reset_index(name='counts').sort_values('counts', ascending=False)
        num_bars = len(dataframe_grouped[x_col].unique())
        colors = plt.cm.Set3(np.linspace(0, 1, num_bars))
        fig, ax = plt.subplots(figsize=(8, 5))

        x_pos = range(len(dataframe_grouped[x_col]))

        ax.bar(x_pos, dataframe_grouped['counts'], color=colors)
        ax.set_xlabel(x_col)
        ax.set_ylabel('Count')
        ax.set_title(f'Distribution by {x_col}')

        ax.set_xticks(x_pos)
        ax.set_xticklabels(dataframe_grouped[x_col], rotation=rotation)

        # Label each bar with its share of the total dataset
        for i, value in enumerate(dataframe_grouped['counts']):
            ax.annotate('{:.1%}'.format(value / count), (i, value), textcoords="offset points", xytext=(0, 10), ha='center')

        plt.show()
for col in cat_cols:
    cat_summary(df, col, plot=True)

Kill the process (proc) when your work is finished.

proc.kill()
