# Data Profiling and Preparation
## Chapter I: Data Profiling

---

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display setting: None removes the column limit, so wide frames are shown without truncation
pd.options.display.max_columns = None

# Structure Discovery

In [0]:
# Read data
# NOTE(review): absolute, user-specific workspace path — this cell only runs
# where that exact path exists; consider a configurable data directory.
telco_churn = pd.read_csv("/Workspace/Users/supawit.somsa@kmutt.ac.th/Telco-Churn.csv")

In [0]:
# Preview the first rows to eyeball column names and raw values
telco_churn.head()

## Checking dimension

In [0]:
# (number of rows, number of columns)
telco_churn.shape

## Checking columns

In [0]:
# Column labels of the frame
telco_churn.columns

## Checking data type

In [0]:
# Per-column dtype and non-null count, plus overall memory usage
telco_churn.info()

In [0]:
# dtype of each column only (a more compact view than .info())
telco_churn.dtypes

In [0]:
# Value of `customerID` in the row labelled 0 (the first row under the default index)
telco_churn.loc[0, "customerID"]

In [0]:
# Python type of a single `customerID` value, i.e. what an `object` column actually holds
type(telco_churn.loc[0, "customerID"])

## Fixing data types

In [0]:
# Re-read data with `dtype` parameter
# `dtype={"SeniorCitizen": str}` makes pandas parse that column as strings
# instead of inferring a type; this overwrites the `telco_churn` loaded earlier.
telco_churn = pd.read_csv("/Workspace/Users/supawit.somsa@kmutt.ac.th/Telco-Churn.csv", dtype={"SeniorCitizen":str})

In [0]:
# Cast column
# Alternative to re-reading: convert an already-loaded column to strings.
# NOTE(review): largely redundant when the column was already read with dtype=str.
telco_churn["SeniorCitizen"] = telco_churn["SeniorCitizen"].astype(str)

# Univariate Analysis

## Categorical variables

### Non-graphical approach

Frequency Table

In [0]:
# One-way frequency table: number of rows per InternetService value.
# The string passed to `columns` only names the single output column.
pd.crosstab(
    index=telco_churn["InternetService"],
    columns="Count",
)

In [0]:
# Same table as proportions: normalize=True divides every count by the grand total
pd.crosstab(
    index=telco_churn["InternetService"],
    columns="P",
    normalize=True,
)

### Graphical approach

Countplot

In [0]:
# Horizontal count plot: passing the column as `y` puts categories on the y-axis
plt.figure(figsize=(6,3))
sns.countplot(data=telco_churn, y="InternetService")
plt.show()

Count plot with custom order

In [0]:
# `order` fixes the sequence in which the categories are drawn
plt.figure(figsize=(6, 3))
sns.countplot(data=telco_churn, y="InternetService", order=["No", "DSL", "Fiber optic"])
plt.show()

Annotate the bar

In [0]:
def annotate_vertical_plot(ax, fmt=".0f"):
    """Write each bar's height just above it on a vertical bar chart.

    Parameters
    ----------
    ax : matplotlib Axes (or any object exposing ``patches`` and ``annotate``)
        Axes whose ``patches`` are the bars to label.
    fmt : str, default ".0f"
        Format spec applied to every bar height; the default prints whole
        numbers, which suits count plots.
    """
    for bar in ax.patches:
        height = bar.get_height()
        # Anchor at the top-centre of the bar; va="bottom" places the text above it.
        anchor = (bar.get_x() + bar.get_width() / 2, height)
        ax.annotate(format(height, fmt), anchor, ha="center", va="bottom")
def annotate_horizontal_plot(ax, fmt=".0f", rotation=-90):
    """Write each bar's length at its end on a horizontal bar chart.

    Parameters
    ----------
    ax : matplotlib Axes (or any object exposing ``patches`` and ``annotate``)
        Axes whose ``patches`` are the bars to label.
    fmt : str, default ".0f"
        Format spec applied to every bar length; the default prints whole
        numbers, which suits count plots.
    rotation : float, default -90
        Text rotation in degrees. The default keeps the original behaviour
        (label turned sideways); pass 0 for upright labels.
    """
    for bar in ax.patches:
        length = bar.get_width()
        # Anchor at the end of the bar, vertically centred on it.
        anchor = (length, bar.get_y() + bar.get_height() / 2)
        ax.annotate(format(length, fmt), anchor, ha="left", va="center", rotation=rotation)

In [0]:
# Keep the Axes returned by seaborn so the helper can label each bar with its count
plt.figure(figsize=(6,3))
ax = sns.countplot(data=telco_churn, y="InternetService")
annotate_horizontal_plot(ax)
plt.show()

## Numerical variables

### Non-graphical approach

.describe

In [0]:
# Summary statistics from .describe(), rounded to 2 decimals for readability
telco_churn.describe().round(decimals=2)

### Graphical approach

Histogram

In [0]:
# Histogram through pandas' plotting interface (drawn by matplotlib), 30 bins
plt.figure(figsize=(5, 4))
telco_churn["tenure"].plot(kind="hist", bins=30)
plt.show()

In [0]:
# Using Seaborn
# Same variable and bin count as the pandas version, drawn with sns.histplot
plt.figure(figsize=(5,4))
sns.histplot(data=telco_churn, x="tenure", bins=30)
plt.show()

Box and Whiskers plot

In [0]:
# Horizontal box plot of one numeric column; a wide, short figure suits a single box
plt.figure(figsize=(10,2))
sns.boxplot(data=telco_churn, x="MonthlyCharges")
plt.show()

# Multivariate Analysis

## Categorical vs Categorical

### Non-graphical approach

Multi-variable Frequency Table

In [0]:
# Two-way frequency table: rows are Contract values, columns are Churn values
pd.crosstab(index=telco_churn["Contract"], columns=telco_churn["Churn"])

In [0]:
# Row-wise proportions: normalize="index" divides each Contract row by its
# row total, so the Churn shares within every contract type sum to 1
pd.crosstab(
    index=telco_churn["Contract"],
    columns=telco_churn["Churn"],
    normalize="index",
)

In [0]:
# Column-wise proportions: normalize="columns" divides each Churn column by
# its column total, so the Contract shares within every churn group sum to 1
pd.crosstab(
    index=telco_churn["Contract"],
    columns=telco_churn["Churn"],
    normalize="columns",
)

### Graphical approach

Count plot with hue

In [0]:
# `hue` splits each Contract bar by Churn value, giving side-by-side counts
plt.figure(figsize=(5,3))
sns.countplot(data=telco_churn, x="Contract", hue="Churn")
plt.show()

## Categorical vs Numerical

### Non-graphical approach

groupby + aggregate

In [0]:
# Median and mean MonthlyCharges per contract type; each keyword names an output column
telco_churn.groupby("Contract").agg(
    Median_monthly_charges=pd.NamedAgg(column="MonthlyCharges", aggfunc="median"),
    Mean_monthly_charges=pd.NamedAgg(column="MonthlyCharges", aggfunc="mean"),
)

In [0]:
# Median MonthlyCharges for every (Contract, Churn) combination
(
    telco_churn
    .groupby(["Contract", "Churn"])
    .agg(Median_monthly_charges=("MonthlyCharges", "median"))
)

### Graphical approach

#### Nominal vs Numerical

Bar plot (aggregated)

In [0]:
# Bar length is the aggregate chosen by `estimator` (here the median) of MonthlyCharges per Contract
plt.figure(figsize=(5,3))
sns.barplot(data=telco_churn, y="Contract", x="MonthlyCharges", estimator=np.median)
plt.show()

Box plot

In [0]:
# One box per Contract value, showing the spread of MonthlyCharges within each
plt.figure(figsize=(7, 2))
sns.boxplot(data=telco_churn, y="Contract", x="MonthlyCharges")
plt.show()

#### Ordinal vs Numerical

Line plot

In [0]:
# Read data
# NOTE(review): absolute, user-specific workspace path — only resolves where that path exists.
weather_data = pd.read_csv("/Workspace/Users/supawit.somsa@kmutt.ac.th/hourly_weather.csv")
weather_data.head()

In [0]:
# Keep the first 10 characters of `datetime` as the calendar date
# (assumes ISO-like "YYYY-MM-DD..." strings — TODO confirm against the CSV)
weather_data["date"] = weather_data["datetime"].str.slice(stop=10)

# Average temperature per city and day; as_index=False keeps the keys as ordinary columns
daily_weather = (
    weather_data
    .groupby(["city", "date"], as_index=False)
    .agg(Mean_daily_temp=("temp_c", "mean"))
)

In [0]:
# Restrict to one city so the line shows a single daily series
weather_bangkok = daily_weather[daily_weather["city"]=="Bangkok"]
plt.figure(figsize=(10,4))
ax = sns.lineplot(data=weather_bangkok, x="date", y="Mean_daily_temp")
# `date` holds sliced strings, so tick labels are rotated to stay readable
plt.xticks(rotation=90)
plt.show()

#### Categorical vs Distribution

Stylized Histogram

In [0]:
# Overlaid histograms of MonthlyCharges, one colour per PaperlessBilling value
plt.figure(figsize=(7,4))
sns.histplot(data=telco_churn, x="MonthlyCharges", hue="PaperlessBilling", bins=50)
plt.show()

## Numerical vs Numerical

### Non-graphical approach

Correlation

In [0]:
# Read data
# NOTE(review): absolute, user-specific workspace path — only resolves where that path exists.
mtcars = pd.read_csv("/Workspace/Users/supawit.somsa@kmutt.ac.th/mtcars.csv")

In [0]:
# Preview the first rows of the cars data
mtcars.head()

In [0]:
# Pairwise correlation matrix of the selected columns
mtcars[["mpg", "cyl", "disp", "hp", "drat", "wt", "qsec"]].corr()

In [0]:
# Same correlation matrix drawn as a heatmap; annot=True prints each coefficient in its cell
sns.heatmap(mtcars[["mpg", "cyl", "disp", "hp", "drat", "wt", "qsec"]].corr(), cmap="PuRd", annot=True)
plt.show()

### Graphical approach

Scatterplot

In [0]:
# Raw relationship between two numeric columns, one point per row
plt.figure(figsize=(5,3))
sns.scatterplot(data=mtcars, x="mpg", y="hp")
plt.show()

In [0]:
# Scatter plot with a fitted regression line overlaid
plt.figure(figsize=(5,3))
sns.regplot(data=mtcars, x="mpg", y="hp")
plt.show()

# Plot controls

## Title

In [0]:
# plt.title applies to the current Axes, i.e. the one regplot just drew on
plt.figure(figsize=(5,3))
sns.regplot(data=mtcars, x="mpg", y="hp")
plt.title("Relationship between `mpg` and `hp`")
plt.show()

## Axis label

In [0]:
# Replace the default axis labels (the column names) with descriptive text
plt.figure(figsize=(5,3))
sns.regplot(data=mtcars, x="mpg", y="hp")
plt.title("Relationship between `mpg` and `hp`")
plt.xlabel("Efficiency (mpg)")
plt.ylabel("Horsepower (hp)")
plt.show()

## Axis ticks

In [0]:
plt.figure(figsize=(6,3))
# Pin the bar order explicitly. Without `order`, string categories are drawn in
# the order they first appear in the data, so the hard-coded labels below could
# silently end up on the wrong bars.
# Assumes `SeniorCitizen` holds the strings "0" and "1" (it is cast to str
# earlier in this notebook) — TODO confirm against the CSV.
ax = sns.countplot(data=telco_churn, x="SeniorCitizen", order=["0", "1"])
plt.title("Distribution of `SeniorCitizen`")
plt.ylabel("Count")
# Fix the tick positions before relabelling so each label is tied to a known tick
ax.set_xticks([0, 1])
ax.set_xticklabels(["No", "Yes"])
plt.show()

## Fonts

In [0]:
# Add Thai font to Matplotlib font library
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from matplotlib.font_manager import fontManager, FontProperties

# NOTE(review): absolute, user-specific workspace path — only resolves where that path exists.
path = "/Workspace/Users/supawit.somsa@kmutt.ac.th/Kanit-Regular.ttf"
# Register the .ttf file so Matplotlib can find the font by name
fontManager.addfont(path)

# Read the font's family name from the file and make it seaborn's default font.
# sns.set changes the global plotting theme, so every later plot is affected.
prop = FontProperties(fname=path)
sns.set(font=prop.get_name(), font_scale=1.2)

In [0]:
# Same regression plot with Thai title and axis labels; relies on the font configured via sns.set in the previous cell
plt.figure(figsize=(5,3))
sns.regplot(data=mtcars, x="mpg", y="hp")
plt.title("ความสัมพันธ์ระหว่างประสิทธิภาพและกำลังของเครื่องยนต์")
plt.xlabel("ประสิทธิภาพ (mpg)")
plt.ylabel("กำลัง (hp)")
plt.show()

## Color palette

In [0]:
# Baseline with seaborn's default colours, for comparison with the palette examples below
plt.figure(figsize=(6,3))
sns.countplot(data=telco_churn, x="gender")
plt.show()

Using color name

In [0]:
# One named colour per category, applied in the order the bars are drawn
# NOTE(review): seaborn 0.13 deprecates `palette` without `hue`; on that version
# assign hue="gender" with legend=False — confirm the installed seaborn version first.
plt.figure(figsize=(6,3))
sns.countplot(data=telco_churn, x="gender", palette=["salmon", "navy"])
plt.show()

Using color code

In [0]:
# Same idea with hex colour codes instead of colour names
# NOTE(review): seaborn 0.13 deprecates `palette` without `hue`; on that version
# assign hue="gender" with legend=False — confirm the installed seaborn version first.
plt.figure(figsize=(6,3))
sns.countplot(data=telco_churn, x="gender", palette=["#F55B65", "#63ACF5"])
plt.show()

## Subplots

In [0]:
# One row, two panels: each countplot is drawn on its own Axes via `ax=`
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10,3))
sns.countplot(data=telco_churn,
              x="gender",
              palette=["#F55B65", "#63ACF5"],
              ax=axs[0]
             )
# Every bar gets the same colour, so use `color=` instead of a one-element
# `palette`: a palette shorter than the number of categories only works by
# being cycled, and seaborn 0.13 warns about it.
sns.countplot(data=telco_churn,
              x="InternetService",
              color="green",
              ax=axs[1]
             )
plt.show()

---