Combining columns together

Imports

import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

Create data

# Toy dataset: a text column ('label') and an integer column ('score').
data = {
    'label': ['dog', 'cat', 'catdog', 'dog', 'catdog'],
    'score': [1, 2, 3, 4, 5],
}
# Pass the column order explicitly so the frame layout does not depend on
# the dict's key order.
df = pd.DataFrame(data, columns=["label", "score"])
df  # bare expression: displays the frame when run as a notebook cell

label score
0 dog 1
1 cat 2
2 catdog 3
3 dog 4
4 catdog 5

Define numerical and non-numerical columns

def get_non_numerical_columns(df):
    """Return the names of the columns of ``df`` that are not numeric.

    A column counts as numeric when ``df.select_dtypes('number')`` keeps it;
    every other column name is returned, in the frame's original column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Column names that are not among the numeric column names.
    """
    numeric_names = list(df.select_dtypes('number').columns)
    remaining = []
    for name in df.columns:
        # Filter by name: anything reported as numeric is skipped.
        if name in numeric_names:
            continue
        remaining.append(name)
    return remaining

def get_numerical_columns(df):
    """Return the names of the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Names of the columns kept by ``df.select_dtypes('number')``, in
        the order that call returns them.
    """
    numeric_frame = df.select_dtypes(include='number')
    return list(numeric_frame.columns)

# Split the frame's column names into two groups; the pipelines defined
# later in the notebook are each given one of these lists.
numerics = get_numerical_columns(df)
non_numerics = get_non_numerical_columns(df)

Create custom transformer (fit and transform methods)

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a fixed set of columns.

    Parameters
    ----------
    columns
        Key used to index the input in ``transform`` (in this notebook,
        a list of DataFrame column names).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.columns``.

        ``X`` must support ``X[self.columns]`` (e.g. a pandas DataFrame).
        """
        selected = X[self.columns]
        return selected

Create pipeline

# One single-step pipeline per column group; each step simply selects
# that group's columns.
cat_pipeline = Pipeline(steps=[
    ('cat_selector', ColumnSelector(columns=non_numerics)),
])
num_pipeline = Pipeline(steps=[
    ('num_selector', ColumnSelector(columns=numerics)),
])

# Each ColumnTransformer entry is a (name, transformer, columns) triple.
full_pipeline = ColumnTransformer(transformers=[
    ('cat', cat_pipeline, non_numerics),
    ('num', num_pipeline, numerics),
])

Fit pipeline

# Fit the ColumnTransformer on the example frame; the lines that follow are
# the notebook's printed repr of the fitted object.
full_pipeline.fit(df)
ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cat',
                                 Pipeline(memory=None,
                                          steps=[('cat_selector',
                                                  ColumnSelector(columns=['label']))],
                                          verbose=False),
                                 ['label']),
                                ('num',
                                 Pipeline(memory=None,
                                          steps=[('num_selector',
                                                  ColumnSelector(columns=['score']))],
                                          verbose=False),
                                 ['score'])],
                  verbose=False)

Transform pipeline

# Apply the fitted transformer to the same frame; the output shown below is
# an object-dtype array with the 'label' values followed by the 'score' values.
full_pipeline.transform(df)
array([['dog', 1],
       ['cat', 2],
       ['catdog', 3],
       ['dog', 4],
       ['catdog', 5]], dtype=object)

Source

  • The "Hands-On Machine Learning" book