Selecting non-numerical columns

Imports

import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

Create data

# Toy frame with one string column ('label') and one integer column ('score').
data = {
    'label': ['dog', 'cat', 'catdog', 'dog', 'catdog'],
    'score': [1, 2, 3, 4, 5],
}
df = pd.DataFrame(data=data, columns=['label', 'score'])
df

label score
0 dog 1
1 cat 2
2 catdog 3
3 dog 4
4 catdog 5

Define non-numerical columns

def get_non_numerical_columns(df):
    """Return the labels of the columns of *df* whose dtype is not numeric.

    Args:
        df: A pandas DataFrame.

    Returns:
        A list of column labels, in the frame's column order, for every
        column that the ``'number'`` dtype selector does not match.
    """
    # Ask pandas for the complement directly instead of selecting the numeric
    # columns and then filtering labels with a list-membership test: this is
    # a single pass and the decision is made per column, not per label.
    return list(df.select_dtypes(exclude='number').columns)

# Collect the non-numerical column labels of the toy frame and show them.
non_numerics = get_non_numerical_columns(df)
print(non_numerics)
['label']

Create custom transformer (fit and transform methods)

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a chosen subset of columns.

    Args:
        columns: The key used to index the input in ``transform``
            (``X[columns]``).
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return this instance; ``X`` and ``y`` are unused."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored ``columns``."""
        wanted = self.columns
        return X[wanted]

Create non-numerical (categorical) pipeline

# Single-step pipeline: select the non-numerical columns.
cat_pipeline = Pipeline(
    steps=[
        ('cat_selector', ColumnSelector(non_numerics)),
    ]
)

Fit pipeline

# Fit the pipeline on the frame; the value displayed below is the pipeline's repr.
cat_pipeline.fit(df)
Pipeline(memory=None,
         steps=[('cat_selector', ColumnSelector(columns=['label']))],
         verbose=False)

Transform pipeline

# Run the selector on the frame; the output below contains only the 'label' column.
cat_pipeline.transform(df)

label
0 dog
1 cat
2 catdog
3 dog
4 catdog

From