Preprocessing¶
In [1]:
Copied!
# Uncomment the next line and run this cell to install sorix
# (%pip installs into the environment of the running kernel).
#%pip install 'sorix @ git+https://github.com/Mitchell-Mirano/sorix.git@main'
In [2]:
Copied!
import numpy as np
import pandas as pd
from sorix.preprocessing import OneHotEncoder
from sorix.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from sorix.preprocessing import ColumnTransformer
import numpy as np
import pandas as pd
from sorix.preprocessing import OneHotEncoder
from sorix.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from sorix.preprocessing import ColumnTransformer
In [3]:
Copied!
# Toy dataset: two numeric columns ('Edad', 'Ingresos'), two categorical
# columns ('Pais', 'Ciudades') and a binary target ('Compra').
data = {
    'Edad': [25, 30, 45, 50, 35, 60, 20, 40],
    'Ingresos': [30000, 50000, 100000, 120000, 70000, 150000, 20000, 80000],
    'Pais': ['EE. UU.', 'Canadá', 'México', 'EE. UU.', 'Canadá', 'México', 'EE. UU.', 'Canadá'],
    'Ciudades': ['New York', 'Toronto', 'Mexico City', 'New York', 'Toronto', 'Mexico City', 'New York', 'Toronto'],
    'Compra': [0, 1, 1, 1, 0, 1, 0, 1],
}
df = pd.DataFrame(data)

# Split the frame into the feature matrix and the target column.
# `df` itself is left untouched, so re-running this cell is idempotent.
X = df.drop(columns='Compra')
y = df['Compra']

# Display the features as the cell output.
X
Out[3]:
| Edad | Ingresos | Pais | Ciudades | |
|---|---|---|---|---|
| 0 | 25 | 30000 | EE. UU. | New York |
| 1 | 30 | 50000 | Canadá | Toronto |
| 2 | 45 | 100000 | México | Mexico City |
| 3 | 50 | 120000 | EE. UU. | New York |
| 4 | 35 | 70000 | Canadá | Toronto |
| 5 | 60 | 150000 | México | Mexico City |
| 6 | 20 | 20000 | EE. UU. | New York |
| 7 | 40 | 80000 | Canadá | Toronto |
In [4]:
Copied!
# Sanity check on the feature matrix: (number of rows, number of feature columns).
X.shape
Out[4]:
(8, 4)
In [5]:
Copied!
# Column groups used to route each feature to the matching preprocessor:
# string-valued columns are one-hot encoded, numeric columns are scaled.
categorical_features = ['Pais', 'Ciudades']
numeric_features = ['Edad', 'Ingresos']
One Hot Encoder¶
In [6]:
Copied!
# Fit the encoder on the categorical columns and encode them in a single step.
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[categorical_features])

# Show the encoded matrix, labelled with the column names reported by the
# fitted encoder.
pd.DataFrame(X_encoded, columns=encoder.get_features_names())
Out[6]:
| Pais_Canadá | Pais_EE. UU. | Pais_México | Ciudades_Mexico City | Ciudades_New York | Ciudades_Toronto | |
|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 2 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
| 3 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 5 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 |
| 6 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 7 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
Standard Scaler¶
In [7]:
Copied!
# Fit the scaler on the numeric columns and scale them in a single step.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numeric_features])

# Show the scaled values, labelled with the column names reported by the
# fitted scaler.
pd.DataFrame(X_scaled, columns=scaler.get_features_names())
Out[7]:
| Edad | Ingresos | |
|---|---|---|
| 0 | -1.051315 | -1.137500 |
| 1 | -0.650814 | -0.658553 |
| 2 | 0.550689 | 0.538816 |
| 3 | 0.951190 | 1.017763 |
| 4 | -0.250313 | -0.179605 |
| 5 | 1.752192 | 1.736185 |
| 6 | -1.451816 | -1.376974 |
| 7 | 0.150188 | 0.059868 |
One Hot Encoder + Standard Scaler¶
In [8]:
Copied!
# Preprocess each column group with its own transformer.
encoder = OneHotEncoder()
scaler = StandardScaler()
X_encoded = encoder.fit_transform(X[categorical_features])
X_scaled = scaler.fit_transform(X[numeric_features])

# Join both results column-wise: encoded columns first, scaled columns second.
X_train = np.hstack((X_encoded, X_scaled))

# The column names are concatenated in the same order as the hstack above so
# that every label lines up with its column.
pd.DataFrame(X_train, columns=encoder.get_features_names() + scaler.get_features_names())
Out[8]:
| Pais_Canadá | Pais_EE. UU. | Pais_México | Ciudades_Mexico City | Ciudades_New York | Ciudades_Toronto | Edad | Ingresos | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -1.051315 | -1.137500 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.650814 | -0.658553 |
| 2 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.550689 | 0.538816 |
| 3 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.951190 | 1.017763 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.250313 | -0.179605 |
| 5 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.752192 | 1.736185 |
| 6 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -1.451816 | -1.376974 |
| 7 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.150188 | 0.059868 |
One Hot Encoder + MinMax Scaler¶
In [9]:
Copied!
# Same manual pipeline as before, this time scaling the numeric columns with
# MinMaxScaler.
encoder = OneHotEncoder()
scaler = MinMaxScaler()
X_encoded = encoder.fit_transform(X[categorical_features])
X_scaled = scaler.fit_transform(X[numeric_features])

# Join both results column-wise: encoded columns first, scaled columns second.
X_train = np.hstack((X_encoded, X_scaled))

# The column names are concatenated in the same order as the hstack above so
# that every label lines up with its column.
pd.DataFrame(X_train, columns=encoder.get_features_names() + scaler.get_features_names())
Out[9]:
| Pais_Canadá | Pais_EE. UU. | Pais_México | Ciudades_Mexico City | Ciudades_New York | Ciudades_Toronto | Edad | Ingresos | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.125 | 0.076923 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.250 | 0.230769 |
| 2 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.625 | 0.615385 |
| 3 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.750 | 0.769231 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.375 | 0.384615 |
| 5 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.000 | 1.000000 |
| 6 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.000 | 0.000000 |
| 7 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.500 | 0.461538 |
One Hot Encoder + Robust Scaler¶
In [10]:
Copied!
# Same manual pipeline as before, this time scaling the numeric columns with
# RobustScaler.
encoder = OneHotEncoder()
scaler = RobustScaler()
X_encoded = encoder.fit_transform(X[categorical_features])
X_scaled = scaler.fit_transform(X[numeric_features])

# Join both results column-wise: encoded columns first, scaled columns second.
X_train = np.hstack((X_encoded, X_scaled))

# The column names are concatenated in the same order as the hstack above so
# that every label lines up with its column.
pd.DataFrame(X_train, columns=encoder.get_features_names() + scaler.get_features_names())
Out[10]:
| Pais_Canadá | Pais_EE. UU. | Pais_México | Ciudades_Mexico City | Ciudades_New York | Ciudades_Toronto | Edad | Ingresos | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -0.714286 | -0.750000 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.428571 | -0.416667 |
| 2 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.428571 | 0.416667 |
| 3 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.714286 | 0.750000 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.142857 | -0.083333 |
| 5 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.285714 | 1.250000 |
| 6 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -1.000000 | -0.916667 |
| 7 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.142857 | 0.083333 |
Column Transformer¶
In [11]:
Copied!
# Declare the whole preprocessing in one object instead of fitting each
# transformer by hand and stacking the results. Each entry is
# (label, transformer, columns the transformer is applied to).
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_features),
        ('num', StandardScaler(), numeric_features),
    ]
)

# Fit every transformer on its column group and transform the full frame.
X_train = column_transformer.fit_transform(X)

# Show the result, labelled with the column names reported by the fitted
# column transformer.
pd.DataFrame(X_train, columns=column_transformer.get_features_names())
Out[11]:
| cat_Pais_Canadá | cat_Pais_EE. UU. | cat_Pais_México | cat_Ciudades_Mexico City | cat_Ciudades_New York | cat_Ciudades_Toronto | num_Edad | num_Ingresos | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -1.051315 | -1.137500 |
| 1 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.650814 | -0.658553 |
| 2 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.550689 | 0.538816 |
| 3 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.951190 | 1.017763 |
| 4 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | -0.250313 | -0.179605 |
| 5 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 1.752192 | 1.736185 |
| 6 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | -1.451816 | -1.376974 |
| 7 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.150188 | 0.059868 |