API Reference

Ingredients

Wrapper around either a polars or pandas DataFrame that stores column roles (e.g., predictor). Due to the workings of polars, we do not subclass pl.DataFrame anymore, but instead store the DataFrame as an attribute.

Args:

- roles: roles of DataFrame columns as (list of) strings. Defaults to None.
- check_roles: If set to False, does not check whether the roles match existing columns. Defaults to True.

See also: pandas.DataFrame

Attributes:

- roles (dict): dictionary of column roles

Source code in recipies/ingredients.py
class Ingredients:
    """Wrapper around either polars.DataFrame to store columns roles (e.g., predictor)
        Due to the workings of polars, we do not subclass pl.dataframe anymore,
        but instead store the dataframe as an attribute.
    Args:
        roles: roles of DataFrame columns as (list of) strings.
            Defaults to None.
        check_roles: If set to false, doesn't check whether the roles match existing columns.
            Defaults to True.

    See also: pandas.DataFrame

    Attributes:
        roles (dict): dictionary of column roles
    """

    _metadata = ["roles"]

    def __init__(
        self,
        data: pl.DataFrame | pd.DataFrame | Ingredients = None,
        copy: bool = None,
        roles: dict = None,
        check_roles: bool = True,
        backend: Backend = None,
    ):
        if backend is None:
            if isinstance(data, pl.DataFrame):
                self.backend = Backend.POLARS
            elif isinstance(data, pd.DataFrame):
                self.backend = Backend.PANDAS
            elif isinstance(data, Ingredients):
                self.backend = data.get_backend()
            else:
                raise ValueError("Backend not specified and could not be inferred from data.")
        else:
            self.backend = backend
        if isinstance(data, pd.DataFrame) or isinstance(data, pl.DataFrame):
            if self.backend == Backend.POLARS:
                if isinstance(data, pd.DataFrame):
                    self.data = pl.DataFrame(data)
                elif isinstance(data, pl.DataFrame):
                    self.data = data
                else:
                    raise TypeError(f"Expected DataFrame, got {data.__class__}")
            elif self.backend == Backend.PANDAS:
                if isinstance(data, pd.DataFrame):
                    self.data = data
                if isinstance(data, pl.DataFrame):
                    self.data = data.to_pandas()
            else:
                raise ValueError(f"Backend {self.backend} not supported.")
            self.schema = self.get_schema()
            self.dtypes = self.get_schema()

        if isinstance(data, Ingredients) and roles is None:
            if copy is None or copy is True:
                self.roles = deepcopy(data.roles)
            else:
                self.roles = data.roles
            self.data = data.data
            self.schema = data.schema
            self.dtypes = self.schema

        elif roles is None:
            self.roles = {}
        elif not isinstance(roles, dict):
            raise TypeError(f"Expected dict object for roles, got {roles.__class__}")
        elif check_roles and not all(k in self.data.columns for k in roles.keys()):
            raise ValueError(
                "Roles contains variable names that are not in the data. "
                f"Got roles for {list(roles.keys())}, available columns: {list(self.data.columns)}"
            )
        else:
            if copy is None or copy is True:
                self.roles = deepcopy(roles)
            else:
                self.roles = roles

    @property
    def _constructor(self):
        return Ingredients

    @property
    def columns(self):
        return self.data.columns

    def to_df(self, output_format=None) -> pl.DataFrame:
        """Return the underlying DataFrame.

        Args:
            output_format: Backend to convert to before returning. Defaults to None (no conversion).

        Returns:
            The underlying data as a DataFrame.
        """
        if output_format == Backend.POLARS:
            if self.backend == Backend.POLARS:
                return self.data
            else:
                return pl.DataFrame(self.data)
        elif output_format == Backend.PANDAS:
            if self.backend == Backend.POLARS:
                return self.data.to_pandas()
            else:
                return self.data
        else:
            return self.data

    def _check_column(self, column):
        if not isinstance(column, str):
            raise ValueError(f"Expected string, got {column}")
        if column not in self.columns:
            raise ValueError(f"{column} does not exist in this Data object")

    def _check_role(self, new_role):
        if not isinstance(new_role, str):
            raise TypeError(f"new_role must be string, was {new_role.__class__}")

    def add_role(self, column: str, new_role: str):
        """Adds an additional role for a column that already has roles.

        Args:
            column: The column to receive additional roles.
            new_role: The role to add to the column.

        Raises:
            RuntimeError: If the column has no role yet.
        """
        self._check_column(column)
        self._check_role(new_role)
        if column not in self.roles.keys():
            raise RuntimeError(f"{column} has no roles yet, use update_role instead.")
        self.roles[column] += [new_role]

    def update_role(self, column: str, new_role: str, old_role: str = None):
        """Adds a new role for a column without roles or changes an existing role to a different one.

        Args:
            column: The column to update the roles of.
            new_role: The role to add or change to.
            old_role: Defaults to None. The role to be changed.

        Raises:
            ValueError: If old_role is given but the column has no roles.
                If old_role is given but the column does not have the role old_role.
                If no old_role is given but the column already has multiple roles.
        """
        if isinstance(column, list):
            for col in column:
                self.update_role(col, new_role, old_role)
            return self
        self._check_column(column)
        self._check_role(new_role)
        if old_role is not None:
            if column not in self.roles.keys():
                raise ValueError(
                    f"Attempted to update role of {column} from {old_role} to {new_role} "
                    f"but {column} does not have a role yet."
                )
            elif old_role not in self.roles[column]:
                raise ValueError(
                    f"Attempted to set role of {column} from {old_role} to {new_role} "
                    f"but {old_role} not among current roles: {self.roles[column]}."
                )
            self.roles[column].remove(old_role)
            self.roles[column].append(new_role)
        else:
            if column not in self.roles.keys() or len(self.roles[column]) == 1:
                self.roles[column] = [new_role]
            else:
                raise ValueError(
                    f"Attempted to update role of {column} to {new_role} but "
                    f"{column} has more than one current roles: {self.roles[column]}"
                )

    def select_dtypes(self, include=None):
        dtypes = self.get_str_dtypes()
        selected = [key for key, value in dtypes.items() if value in include]
        return selected

    def get_dtypes(self):
        dtypes = list(self.schema.values())
        return dtypes

    def get_str_dtypes(self):
        """Helper function for polars DataFrames to return the schema with dtypes as strings."""
        dtypes = self.get_schema()
        return {key: str(value) for key, value in dtypes.items()}

    def get_schema(self):
        if self.backend == Backend.POLARS:
            return self.data.schema
        else:
            return self.data.dtypes

    def get_df(self):
        return self.to_df()

    def set_df(self, df):
        self.data = df

    def groupby(self, by):
        if self.backend == Backend.POLARS:
            return self.data.group_by(by)
        else:
            return self.data.groupby(by)

    def get_backend(self):
        return self.backend

    def __setitem__(self, idx, val):
        if self.backend == Backend.POLARS:
            self.data[idx] = val
        else:
            if isinstance(idx, tuple):
                rows, column = idx
                self.data.loc[rows, column] = val
            else:
                # Use assign for single column assignment to avoid fragmentation
                if isinstance(idx, str):
                    self.data = self.data.assign(**{idx: val})
                else:
                    # For non-string indices, use a more efficient approach
                    # that avoids the fragmentation warning
                    import warnings

                    with warnings.catch_warnings():
                        warnings.filterwarnings(
                            "ignore", message="DataFrame is highly fragmented", category=pd.errors.PerformanceWarning
                        )
                        self.data[idx] = val

    @overload
    def __getitem__(self, idx: list[str]) -> pl.DataFrame: ...

    @overload
    def __getitem__(self, idx: int) -> pl.Series: ...

    def __getitem__(self, idx):
        return self.data[idx]

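A minimal usage sketch (illustrative, not part of the rendered source); the import path recipies.ingredients is inferred from the "Source code in recipies/ingredients.py" label above:

import polars as pl

from recipies.ingredients import Ingredients  # path inferred from the source location above

# Wrap a DataFrame and declare a role for one column.
df = pl.DataFrame({"stay_id": [1, 1, 2], "hr": [80.0, 85.0, 90.0]})
ing = Ingredients(df, roles={"hr": ["predictor"]})

print(ing.roles)          # {'hr': ['predictor']}
print(ing.columns)        # ['stay_id', 'hr']
print(type(ing.to_df()))  # the stored polars DataFrame
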
add_role(column, new_role)

Adds an additional role for a column that already has roles.

Parameters:

- column (str, required): The column to receive additional roles.
- new_role (str, required): The role to add to the column.

Raises:

- RuntimeError: If the column has no role yet.

Source code in recipies/ingredients.py
def add_role(self, column: str, new_role: str):
    """Adds an additional role for a column that already has roles.

    Args:
        column: The column to receive additional roles.
        new_role: The role to add to the column.

    Raises:
        RuntimeError: If the column has no role yet.
    """
    self._check_column(column)
    self._check_role(new_role)
    if column not in self.roles.keys():
        raise RuntimeError(f"{column} has no roles yet, use update_role instead.")
    self.roles[column] += [new_role]
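
Continuing the hypothetical Ingredients object from the sketch above, a column that already has a role can accumulate further ones:

ing.add_role("hr", "outcome")  # "hr" now carries both roles
print(ing.roles["hr"])         # ['predictor', 'outcome']
# Calling add_role on a column without any role raises RuntimeError;
# use update_role to assign a first role instead.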

get_str_dtypes()

" Helper function for polar dataframes to return schema with dtypes as strings

Source code in recipies/ingredients.py
def get_str_dtypes(self):
    """ "
    Helper function for polar dataframes to return schema with dtypes as strings
    """
    dtypes = self.get_schema()
    return {key: str(value) for key, value in dtypes.items()}

to_df(output_format=None)

Return the underlying DataFrame.

Returns:

- DataFrame: The underlying data as a DataFrame.

Source code in recipies/ingredients.py
def to_df(self, output_format=None) -> pl.DataFrame:
    """Return the underlying DataFrame.


    Returns:
        Self as DataFrame.
    """
    if output_format == Backend.POLARS:
        if self.backend == Backend.POLARS:
            return self.data
        else:
            return pl.DataFrame(self.data)
    elif output_format == Backend.PANDAS:
        if self.backend == Backend.POLARS:
            return self.data.to_pandas()
        else:
            return self.data
    else:
        return self.data

update_role(column, new_role, old_role=None)

Adds a new role for a column without roles or changes an existing role to a different one.

Parameters:

- column (str, required): The column to update the roles of.
- new_role (str, required): The role to add or change to.
- old_role (str, default None): The role to be changed.

Raises:

- ValueError: If old_role is given but the column has no roles; if old_role is given but the column does not have the role old_role; or if no old_role is given but the column already has multiple roles.

Source code in recipies/ingredients.py
def update_role(self, column: str, new_role: str, old_role: str = None):
    """Adds a new role for a column without roles or changes an existing role to a different one.

    Args:
        column: The column to update the roles of.
        new_role: The role to add or change to.
        old_role: Defaults to None. The role to be changed.

    Raises:
        ValueError: If old_role is given but the column has no roles.
            If old_role is given but the column does not have the role old_role.
            If no old_role is given but the column already has multiple roles.
    """
    if isinstance(column, list):
        for col in column:
            self.update_role(col, new_role, old_role)
        return self
    self._check_column(column)
    self._check_role(new_role)
    if old_role is not None:
        if column not in self.roles.keys():
            raise ValueError(
                f"Attempted to update role of {column} from {old_role} to {new_role} "
                f"but {column} does not have a role yet."
            )
        elif old_role not in self.roles[column]:
            raise ValueError(
                f"Attempted to set role of {column} from {old_role} to {new_role} "
                f"but {old_role} not among current roles: {self.roles[column]}."
            )
        self.roles[column].remove(old_role)
        self.roles[column].append(new_role)
    else:
        if column not in self.roles.keys() or len(self.roles[column]) == 1:
            self.roles[column] = [new_role]
        else:
            raise ValueError(
                f"Attempted to update role of {column} to {new_role} but "
                f"{column} has more than one current roles: {self.roles[column]}"
            )
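
Continuing the same hypothetical object, update_role covers both assigning a first role and swapping a specific existing one:

ing.update_role("stay_id", "group")                    # first role for "stay_id"
ing.update_role("hr", "sequence", old_role="outcome")  # replace one of two roles
print(ing.roles)  # {'hr': ['predictor', 'sequence'], 'stay_id': ['group']}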

Recipe

Recipe for preprocessing data

A Recipe object combines a pandas-like Ingredients object with one or more sklearn-inspired transformation Steps to turn the data into a model-ready input.

Parameters:

- data (Ingredients | pl.DataFrame | pd.DataFrame, required): data to be preprocessed.
- outcomes (Union[str, list[str]], default None): names of columns in data that are assigned the 'outcome' role.
- predictors (Union[str, list[str]], default None): names of columns in data that should be assigned the 'predictor' role.
- groups (Union[str, list[str]], default None): names of columns in data that should be assigned the 'group' role.
- sequences (Union[str, list[str]], default None): names of columns in data that should be assigned the 'sequence' role.
Source code in recipies/recipe.py
class Recipe:
    """Recipe for preprocessing data

    A Recipe object combines a pandas-like Ingredients object with one or more
    sklearn-inspired transformation Steps to turn the data into a model-ready input.

    Args:
        data: data to be preprocessed.
        outcomes: names of columns in data that are assigned the 'outcome' role
        predictors: names of columns in data that should be assigned the 'predictor' role
        groups: names of columns in data that should be assigned the 'group' role
        sequences: names of columns in data that should be assigned the 'sequence' role
    """

    columns = None
    roles = None

    def __init__(
        self,
        data: Ingredients | pl.DataFrame | pd.DataFrame,
        outcomes: Union[str, list[str]] = None,
        predictors: Union[str, list[str]] = None,
        groups: Union[str, list[str]] = None,
        sequences: Union[str, list[str]] = None,
        backend: Backend = None,
    ):
        if not isinstance(data, Ingredients):
            try:
                data = Ingredients(data, backend=backend)
            except Exception as e:
                raise (f"Expected Ingredients, got {data.__class__} {e}")
        self.data = data
        self.steps = []
        self.original_columns = copy(data.columns)
        self.roles = self.data.roles
        self.columns = self.data.columns

        if outcomes:
            self.update_roles(outcomes, "outcome")
        if predictors:
            self.update_roles(predictors, "predictor")
        if groups:
            self.update_roles(groups, "group")
        if sequences:
            self.update_roles(sequences, "sequence")

    def add_roles(self, vars: Union[str, list[str]], new_role: str = "predictor") -> Recipe:
        """Adds an additional role for one or more columns of the Recipe's Ingredients.

        Args:
            vars: The column(s) to receive additional roles.
            new_role: Defaults to predictor. The role to add to the column.

        See also:
            Ingredients.add_role()

        Returns:
            self
        """
        if isinstance(vars, str):
            vars = [vars]
        for v in vars:
            self.data.add_role(v, new_role)
        return self

    def update_roles(self, vars: Union[str, list[str]], new_role: str = "predictor", old_role: str = None) -> Recipe:
        """Adds a new role for one or more columns of the Recipe's Ingredients without roles
        or changes an existing role to a different one.

        Args:
            vars: The column(s) to update the roles of.
            new_role: Defaults to 'predictor'. The role to add or change to.
            old_role: Defaults to None. The role to be changed.

        See also:
            Ingredients.update_role()

        Returns:
            self
        """
        if isinstance(vars, str):
            vars = [vars]
        for v in vars:
            self.data.update_role(v, new_role, old_role)
        return self

    def add_step(self, step: Step) -> Recipe:
        """Adds a new step to the Recipe

        Args:
            step: a transformation step that should be applied to the Ingredients during prep() and bake()

        Returns:
            self
        """
        self.steps.append(step)
        return self

    def _check_data(self, data: Union[pl.DataFrame | pd.DataFrame, Ingredients]) -> Ingredients:
        if data is None:
            data = self.data
        elif isinstance(data, pl.DataFrame) or isinstance(data, pd.DataFrame):
            # this is only executed when prep or bake receive a DF that is different to the original data
            # don't check the roles here, because self.data can have more roles than data (post feature generation)
            data = Ingredients(data, roles=self.data.roles, check_roles=False)
        # if not data.columns.equals(self.data.columns):
        if not set(data.columns) == set(self.original_columns):
            raise ValueError(
                f"Columns of data argument differs from recipe data: "
                f"{[x for x in data.columns if x not in self.original_columns]}."
            )
        return data

    def _apply_group(self, data, step):
        if step.group:
            group_vars = select_groups(data)
            if len(group_vars) > 0:
                data.groupby(group_vars)
        return data

    def prep(
        self, data: Union[pl.DataFrame | pd.DataFrame, Ingredients] = None, refit: bool = False
    ) -> pl.DataFrame | pd.DataFrame:
        """Fits and transforms, in other words preps, the data.

        Args:
            data: Data to fit and transform. Defaults to None.
            refit: Defaults to False. Whether to refit data.

        Returns:
            Transformed data.
        """
        data = self._check_data(data)
        # Todo: check why the roles disappear after copying
        data = copy(data)
        data = self._apply_fit_transform(data, refit)
        # return pl.DataFrame(data)
        return data.get_df()

    def bake(self, data: Union[pl.DataFrame | pd.DataFrame, Ingredients] = None) -> pl.DataFrame | pd.DataFrame:
        """Transforms, or bakes, the data if it has been prepped.

        Args:
            data: Data to transform. Defaults to None.

        Returns:
            Transformed data.
        """
        data = self._check_data(data)
        # original_data = deepcopy(data)
        data = self._apply_fit_transform(data)
        # return pl.DataFrame(data)
        return data.get_df()

    def _apply_fit_transform(self, data=None, refit=False):
        # applies transform or fit and transform (when refit or not trained yet)
        if data is None:
            data = self.data
        for step in self.steps:
            data = self._apply_group(data, step)
            if refit or not step.trained:
                data = step.fit_transform(data)
            else:
                data = step.transform(data)
        return data

    def __repr__(self):
        repr = "Recipe\n\n"

        # Print all existing roles and how many variables are assigned to each
        num_roles = Counter(chain.from_iterable(self.data.roles.values()))
        num_roles = pd.DataFrame(
            {"role": [r for r in num_roles.keys()], "amount of variables": [n for n in num_roles.values()]}
        )
        repr += "Inputs:\n\n" + num_roles.__repr__() + "\n\n"

        # Print all steps
        repr += "Operations:\n\n"
        for step in self.steps:
            repr += str(step) + "\n"

        return repr

    def get_backend(self):
        return self.data.get_backend()

    def cache(self):
        """Prepares the recipe for caching"""
        if self.data is not None:
            del self.data
        return self
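
A minimal end-to-end sketch; the import paths recipies.recipe and recipies.step are inferred from the "Source code in ..." labels on this page:

from datetime import datetime

import polars as pl

from recipies.recipe import Recipe
from recipies.step import StepScale

df = pl.DataFrame(
    {
        "stay_id": [1, 1, 2, 2],
        "time": [datetime(2024, 1, 1, 0), datetime(2024, 1, 1, 1)] * 2,
        "hr": [80.0, 85.0, 90.0, None],
        "mortality": [0, 0, 1, 1],
    }
)

rec = Recipe(df, outcomes="mortality", predictors="hr", groups="stay_id", sequences="time")
rec.add_step(StepScale())

train_df = rec.prep()   # fits every step, then transforms the recipe's own data
test_df = rec.bake(df)  # reuses the fitted steps on data with the same columns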

add_roles(vars, new_role='predictor')

Adds an additional role for one or more columns of the Recipe's Ingredients.

Parameters:

- vars (Union[str, list[str]], required): The column(s) to receive additional roles.
- new_role (str, default 'predictor'): The role to add to the column.

See also:

Ingredients.add_role()

Returns:

- Recipe: self

Source code in recipies/recipe.py
def add_roles(self, vars: Union[str, list[str]], new_role: str = "predictor") -> Recipe:
    """Adds an additional role for one or more columns of the Recipe's Ingredients.

    Args:
        vars: The column(s) to receive additional roles.
        new_role: Defaults to predictor. The role to add to the column.

    See also:
        Ingredients.add_role()

    Returns:
        self
    """
    if isinstance(vars, str):
        vars = [vars]
    for v in vars:
        self.data.add_role(v, new_role)
    return self

add_step(step)

Adds a new step to the Recipe

Parameters:

- step (Step, required): a transformation step that should be applied to the Ingredients during prep() and bake()

Returns:

- Recipe: self

Source code in recipies/recipe.py
def add_step(self, step: Step) -> Recipe:
    """Adds a new step to the Recipe

    Args:
        step: a transformation step that should be applied to the Ingredients during prep() and bake()

    Returns:
        self
    """
    self.steps.append(step)
    return self

bake(data=None)

Transforms, or bakes, the data if it has been prepped.

Parameters:

- data (Union[pl.DataFrame | pd.DataFrame, Ingredients], default None): Data to transform.

Returns:

- pl.DataFrame | pd.DataFrame: Transformed data.

Source code in recipies/recipe.py
def bake(self, data: Union[pl.DataFrame | pd.DataFrame, Ingredients] = None) -> pl.DataFrame | pd.DataFrame:
    """Transforms, or bakes, the data if it has been prepped.

    Args:
        data: Data to transform. Defaults to None.

    Returns:
        Transformed data.
    """
    data = self._check_data(data)
    # original_data = deepcopy(data)
    data = self._apply_fit_transform(data)
    # return pl.DataFrame(data)
    return data.get_df()

cache()

Prepares the recipe for caching

Source code in recipies/recipe.py
def cache(self):
    """Prepares the recipe for caching"""
    if self.data is not None:
        del self.data
    return self

prep(data=None, refit=False)

Fits and transforms, in other words preps, the data.

Parameters:

- data (Union[pl.DataFrame | pd.DataFrame, Ingredients], default None): Data to fit and transform.
- refit (bool, default False): Whether to refit data.

Returns:

- pl.DataFrame | pd.DataFrame: Transformed data.

Source code in recipies/recipe.py
def prep(
    self, data: Union[pl.DataFrame | pd.DataFrame, Ingredients] = None, refit: bool = False
) -> pl.DataFrame | pd.DataFrame:
    """Fits and transforms, in other words preps, the data.

    Args:
        data: Data to fit and transform. Defaults to None.
        refit: Defaults to False. Whether to refit data.

    Returns:
        Transformed data.
    """
    data = self._check_data(data)
    # Todo: check why the roles disappear after copying
    data = copy(data)
    data = self._apply_fit_transform(data, refit)
    # return pl.DataFrame(data)
    return data.get_df()

update_roles(vars, new_role='predictor', old_role=None)

Adds a new role for one or more columns of the Recipe's Ingredients without roles or changes an existing role to a different one.

Parameters:

- vars (Union[str, list[str]], required): The column(s) to update the roles of.
- new_role (str, default 'predictor'): The role to add or change to.
- old_role (str, default None): The role to be changed.

See also:

Ingredients.update_role()

Returns:

- Recipe: self

Source code in recipies/recipe.py
def update_roles(self, vars: Union[str, list[str]], new_role: str = "predictor", old_role: str = None) -> Recipe:
    """Adds a new role for one or more columns of the Recipe's Ingredients without roles
    or changes an existing role to a different one.

    Args:
        vars: The column(s) to update the roles of.
        new_role: Defaults to 'predictor'. The role to add or change to.
        old_role: Defaults to None. The role to be changed.

    See also:
        Ingredients.update_role()

    Returns:
        self
    """
    if isinstance(vars, str):
        vars = [vars]
    for v in vars:
        self.data.update_role(v, new_role, old_role)
    return self

Step

This class represents a step in a recipe.

Steps are transformations to be executed on selected columns of a DataFrame. They fit a transformer to the selected columns and afterwards transform the data with the fitted transformer.

Parameters:

- sel (Selector, default all_predictors()): Object that holds information about the selected columns.

Attributes:

- columns: List with the names of the selected columns.

Source code in recipies/step.py
class Step:
    """This class represents a step in a recipe.

    Steps are transformations to be executed on selected columns of a DataFrame.
    They fit a transformer to the selected columns and afterwards transform the data with the fitted transformer.

    Args:
        sel: Object that holds information about the selected columns.

    Attributes:
        columns: List with the names of the selected columns.
    """

    def __init__(self, sel: Selector = all_predictors(), supported_backends: list[Backend] = [Backend.POLARS, Backend.PANDAS]):
        self.sel = sel
        self.columns = []
        self._trained = False
        self._group = True
        self.supported_backends = supported_backends

    @property
    def trained(self) -> bool:
        return self._trained

    @property
    def group(self) -> bool:
        return self._group

    def fit(self, data: Ingredients):
        """This function fits the transformer to the data.

        Args:
            data: The DataFrame to fit to.
        """
        data = self._check_ingredients(data)
        self.columns = self.sel(data)
        self.do_fit(data)
        self._trained = True

    @abstractmethod
    def do_fit(self, data: Ingredients):
        pass

    def _check_ingredients(self, data: Union[Ingredients, GroupBy | DataFrameGroupBy]) -> Ingredients:
        """Check input for allowed types

        Args:
            data: input to the step

        Raises:
            ValueError: If a grouped pd.DataFrame is provided to a step that can't use groups.
            ValueError: If the input is not (potentially grouped) Ingredients.

        Returns:
            Validated input
        """
        if isinstance(data, GroupBy) or isinstance(data, DataFrameGroupBy):
            if not self._group:
                raise ValueError("Step does not accept grouped data.")
        if not isinstance(data, Ingredients):
            raise ValueError(f"Expected Ingredients object, got {data.__class__}")
        if self.supported_backends is not None and data.get_backend() not in self.supported_backends:
            raise ValueError(f"{data.get_backend()} not supported by this step.")
        return data

    def transform(self, data: Ingredients) -> Ingredients:
        """This function transforms the data with the fitted transformer.

        Args:
            data: The DataFrame to transform.

        Returns:
            The transformed DataFrame.
        """
        pass

    def fit_transform(self, data: Ingredients) -> Ingredients:
        self.fit(data)
        return self.transform(data)

    def __repr__(self) -> str:
        repr = self.desc + " for "

        if not self.trained:
            repr += str(self.sel)
        else:
            repr += str(self.columns) if len(self.columns) < 3 else str(self.columns[:2] + ["..."])  # FIXME: remove brackets
            repr += " [trained]"

        return repr
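
The fit/transform contract can be illustrated with a small custom step; a sketch only, with do_fit left empty because the transformation is stateless, and with the selector import path as an assumption:

import polars as pl

from recipies.selector import all_numeric_predictors  # assumed module path
from recipies.step import Step


class StepAddOffset(Step):
    """Hypothetical step: add a constant offset to the selected columns (polars backend)."""

    def __init__(self, sel=all_numeric_predictors(), offset=1.0):
        super().__init__(sel)
        self.desc = f"Add offset {offset}"
        self.offset = offset

    def do_fit(self, data):
        pass  # nothing to learn; fit() still records self.columns and marks the step trained

    def transform(self, data):
        new_data = self._check_ingredients(data)
        new_data.set_df(new_data.get_df().with_columns(pl.col(self.columns) + self.offset))
        return new_data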

fit(data)

This function fits the transformer to the data.

Parameters:

- data (Ingredients, required): The DataFrame to fit to.
Source code in recipies/step.py
def fit(self, data: Ingredients):
    """This function fits the transformer to the data.

    Args:
        data: The DataFrame to fit to.
    """
    data = self._check_ingredients(data)
    self.columns = self.sel(data)
    self.do_fit(data)
    self._trained = True

transform(data)

This function transforms the data with the fitted transformer.

Parameters:

- data (Ingredients, required): The DataFrame to transform.

Returns:

- Ingredients: The transformed DataFrame.

Source code in recipies/step.py
def transform(self, data: Ingredients) -> Ingredients:
    """This function transforms the data with the fitted transformer.

    Args:
        data: The DataFrame to transform.

    Returns:
        The transformed DataFrame.
    """
    pass

StepFunction

Bases: Step

Provides a wrapper for a simple transformation function, without fitting.

Source code in recipies/step.py
class StepFunction(Step):
    """Provides a wrapper for a simple transformation function, without fitting."""

    def __init__(self, function, sel: Selector = all_predictors()):
        super().__init__(sel=sel)
        self.function = function
        self._trained = True

    def transform(self, data: Ingredients) -> Ingredients:
        new_data = self._check_ingredients(data)
        self.columns = self.sel(new_data)
        new_data = self.function(new_data, self.columns)
        return new_data
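
Hypothetical usage: the wrapped function receives the Ingredients and the selected column names and must return the (modified) Ingredients. Continuing the polars-backed recipe sketch from the Recipe section:

import polars as pl

from recipies.step import StepFunction


def clip_negative(ingredients, columns):
    # Replace negative values with 0 in the selected columns (polars backend).
    df = ingredients.get_df()
    ingredients.set_df(df.with_columns(pl.col(columns).clip(lower_bound=0)))
    return ingredients


rec.add_step(StepFunction(clip_negative))  # no fitting needed; the step starts out trained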

StepHistorical

Bases: Step

This step generates columns with a historical accumulator provided by the user.

Parameters:

- fun (Accumulator, default MAX): Instance of the Accumulator enumerable that signifies which type of historical accumulation to use.
- suffix (str, default None): Set the name to have the step generate new columns with this suffix instead of the default suffix.
- role (str, default 'predictor'): In case new columns are added, set their role to role.
Source code in recipies/step.py
class StepHistorical(Step):
    """This step generates columns with a historical accumulator provided by the user.

    Args:
        fun: Instance of the Accumulator enumerable that signifies which type of historical accumulation
            to use (default is MAX).
        suffix: Defaults to None. Set the name to have the step generate new columns with this suffix
            instead of the default suffix.
        role: Defaults to 'predictor'. In case new columns are added, set their role to role.
    """

    def __init__(
        self,
        sel: Selector = all_numeric_predictors(),
        fun: Accumulator = Accumulator.MAX,
        suffix: str = None,
        role: str = "predictor",
    ):
        super().__init__(sel)

        self.desc = f"Create historical {fun}"
        self.fun = fun
        if not isinstance(self.fun, Accumulator):
            raise TypeError(f"Expected Accumulator enum for function, got {self.fun.__class__}")
        if suffix is None:
            suffix = fun.value
        self.suffix = suffix
        self.role = role

    def transform(self, data: Ingredients) -> Ingredients:
        """
        Raises:
            TypeError: If the function is not of type Accumulator
        """

        new_data = self._check_ingredients(data)
        new_columns = [c + self.suffix for c in self.columns]

        selected = new_data.data
        selected_cols = pl.col(self.columns)
        id = select_groups(new_data)
        if data.get_backend() == Backend.POLARS:
            if self.fun is Accumulator.MAX:
                res = selected.with_columns(selected_cols.cum_max().over(id).name.suffix(self.suffix))
            elif self.fun is Accumulator.MIN:
                res = selected.with_columns(selected_cols.cum_min().over(id).name.suffix(self.suffix))
            elif self.fun is Accumulator.MEAN:
                res = selected.with_columns(
                    selected_cols.rolling_mean(window_size=selected.height, min_samples=0).over(id).name.suffix(self.suffix)
                )
            elif self.fun is Accumulator.MEDIAN:
                res = selected.with_columns(
                    selected_cols.rolling_median(window_size=selected.height, min_samples=0).over(id).name.suffix(self.suffix)
                )
            elif self.fun is Accumulator.COUNT:
                res = selected.with_columns(selected_cols.cum_count().over(id).name.suffix(self.suffix))
            elif self.fun is Accumulator.VAR:
                res = selected.with_columns(
                    selected_cols.rolling_var(window_size=selected.height, min_samples=0).over(id).name.suffix(self.suffix)
                )
            else:
                raise TypeError(f"Expected Accumulator enum for function, got {self.fun.__class__}")
            new_data.set_df(res)
        else:
            data = data.groupby(id)
            if self.fun is Accumulator.MAX:
                res = data[self.columns].cummax(skipna=True)
            elif self.fun is Accumulator.MIN:
                res = data[self.columns].cummin(skipna=True)
            elif self.fun is Accumulator.MEAN:
                # Reset index, as we get back a multi-index, and we want a simple rolling index
                res = data[self.columns].expanding().mean().reset_index(drop=True)
            elif self.fun is Accumulator.MEDIAN:
                res = data[self.columns].expanding().median().reset_index(drop=True)
            elif self.fun is Accumulator.COUNT:
                res = data[self.columns].expanding().count().reset_index(drop=True)
            elif self.fun is Accumulator.VAR:
                res = data[self.columns].expanding().var().reset_index(drop=True)
            else:
                raise TypeError(f"Expected Accumulator enum for function, got {self.fun.__class__}")
            # df = new_data.get_df()
            # df[new_columns] = res
            # new_data.set_df(df)
            new_data.set_df(new_data.get_df().assign(**{new_columns[i]: res.iloc[:, i] for i in range(len(new_columns))}))

        for nc in new_columns:
            new_data.update_role(nc, self.role)

        return new_data

transform(data)

Raises:

Type Description
TypeError

If the function is not of type Accumulator

Source code in recipies/step.py
def transform(self, data: Ingredients) -> Ingredients:
    """
    Raises:
        TypeError: If the function is not of type Accumulator
    """

    new_data = self._check_ingredients(data)
    new_columns = [c + self.suffix for c in self.columns]

    selected = new_data.data
    selected_cols = pl.col(self.columns)
    id = select_groups(new_data)
    if data.get_backend() == Backend.POLARS:
        if self.fun is Accumulator.MAX:
            res = selected.with_columns(selected_cols.cum_max().over(id).name.suffix(self.suffix))
        elif self.fun is Accumulator.MIN:
            res = selected.with_columns(selected_cols.cum_min().over(id).name.suffix(self.suffix))
        elif self.fun is Accumulator.MEAN:
            res = selected.with_columns(
                selected_cols.rolling_mean(window_size=selected.height, min_samples=0).over(id).name.suffix(self.suffix)
            )
        elif self.fun is Accumulator.MEDIAN:
            res = selected.with_columns(
                selected_cols.rolling_median(window_size=selected.height, min_samples=0).over(id).name.suffix(self.suffix)
            )
        elif self.fun is Accumulator.COUNT:
            res = selected.with_columns(selected_cols.cum_count().over(id).name.suffix(self.suffix))
        elif self.fun is Accumulator.VAR:
            res = selected.with_columns(
                selected_cols.rolling_var(window_size=selected.height, min_samples=0).over(id).name.suffix(self.suffix)
            )
        else:
            raise TypeError(f"Expected Accumulator enum for function, got {self.fun.__class__}")
        new_data.set_df(res)
    else:
        data = data.groupby(id)
        if self.fun is Accumulator.MAX:
            res = data[self.columns].cummax(skipna=True)
        elif self.fun is Accumulator.MIN:
            res = data[self.columns].cummin(skipna=True)
        elif self.fun is Accumulator.MEAN:
            # Reset index, as we get back a multi-index, and we want a simple rolling index
            res = data[self.columns].expanding().mean().reset_index(drop=True)
        elif self.fun is Accumulator.MEDIAN:
            res = data[self.columns].expanding().median().reset_index(drop=True)
        elif self.fun is Accumulator.COUNT:
            res = data[self.columns].expanding().count().reset_index(drop=True)
        elif self.fun is Accumulator.VAR:
            res = data[self.columns].expanding().var().reset_index(drop=True)
        else:
            raise TypeError(f"Expected Accumulator enum for function, got {self.fun.__class__}")
        # df = new_data.get_df()
        # df[new_columns] = res
        # new_data.set_df(df)
        new_data.set_df(new_data.get_df().assign(**{new_columns[i]: res.iloc[:, i] for i in range(len(new_columns))}))

    for nc in new_columns:
        new_data.update_role(nc, self.role)

    return new_data
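
Usage sketch, continuing the earlier recipe; the import path for Accumulator is an assumption based on this page's source references:

from recipies.step import Accumulator, StepHistorical  # Accumulator location assumed

# Cumulative per-group maximum of every selected numeric predictor;
# prep() then adds e.g. an "hr_max" column with role 'predictor'.
rec.add_step(StepHistorical(fun=Accumulator.MAX, suffix="_max"))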

StepImputeFastForwardFill

Bases: Step

Quick variant of pandas' internal fillna(method='ffill') for grouped dataframes.

Note: this variant does not allow for setting a limit.

Source code in recipies/step.py
class StepImputeFastForwardFill(Step):
    """Quick variant of pandas' internal `nafill(method='ffill')` for grouped dataframes.

    Note: this variant does not allow for setting a limit.
    """

    def __init__(self, sel=all_predictors()):
        super().__init__(sel, supported_backends=[Backend.PANDAS])
        self.desc = "Impute with fast ffill"

    def transform(self, data):
        new_data = self._check_ingredients(data)

        # Use cumsum (which is optimised for grouped frames) to figure out which
        # values should be left at NaN, then ffill on the ungrouped dataframe. Adopted from:
        # https://stackoverflow.com/questions/36871783/fillna-forward-fill-on-a-large-dataframe-efficiently-with-groupby
        df = new_data.get_df()
        nofill = df.copy()
        nofill[self.columns] = pd.notnull(nofill[self.columns])
        nofill = nofill.groupby(select_groups(new_data))[self.columns].cumsum()

        df[self.columns] = df[self.columns].ffill()
        for col in self.columns:
            df.loc[nofill[col].to_numpy() == 0, col] = np.nan
        new_data.set_df(df)
        return new_data
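
A pandas-only sketch (the step declares supported_backends=[Backend.PANDAS]); leading missing values stay NaN because the fill never crosses group boundaries:

import numpy as np
import pandas as pd

from recipies.recipe import Recipe
from recipies.step import StepImputeFastForwardFill

pdf = pd.DataFrame({"stay_id": [1, 1, 2, 2], "hr": [80.0, np.nan, np.nan, 90.0]})
rec_pd = Recipe(pdf, predictors="hr", groups="stay_id")
rec_pd.add_step(StepImputeFastForwardFill())
print(rec_pd.prep()["hr"].tolist())  # [80.0, 80.0, nan, 90.0]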

StepImputeFastZeroFill

Bases: Step

Quick variant of pandas' internal fillna(value=0) for grouped dataframes.

Source code in recipies/step.py
class StepImputeFastZeroFill(Step):
    """Quick variant of pandas' internal `nafill(value=0)` for grouped dataframes."""

    def __init__(self, sel=all_predictors()):
        super().__init__(sel, supported_backends=[Backend.PANDAS])
        self.desc = "Impute quickly with 0"

    def transform(self, data):
        new_data = self._check_ingredients(data)

        # Ignore grouping as grouping does not matter for zero fill.
        new_data[self.columns] = new_data[self.columns].fillna(0)

        return new_data

StepImputeFill

Bases: Step

For Pandas: uses pandas' internal fillna function to replace missing values. See pandas.DataFrame.fillna for a description of the arguments.

Source code in recipies/step.py
class StepImputeFill(Step):
    """For Pandas: uses pandas' internal `nafill` function to replace missing values.
    See `pandas.DataFrame.nafill` for a description of the arguments.
    """

    def __init__(self, sel=all_predictors(), value=None, strategy=None, limit=None):
        super().__init__(sel)
        self.desc = f"Impute with {strategy if strategy else value}"
        self.value = value
        self.strategy = strategy
        self.limit = limit

    def transform(self, data):
        new_data = self._check_ingredients(data)
        groups = select_groups(new_data)
        if data.get_backend() == Backend.POLARS:
            available_strategies = list(get_args(pl._typing.FillNullStrategy))
            if self.strategy in available_strategies or self.value is not None:
                if len(groups) > 0:
                    new_data.data = data.data.with_columns(
                        pl.col(self.columns).fill_null(self.value, strategy=self.strategy, limit=self.limit).over(groups)
                    )
                else:
                    new_data.data = data.data.with_columns(
                        pl.col(self.columns).fill_null(self.value, strategy=self.strategy, limit=self.limit)
                    )
            else:
                raise ValueError(
                    f"No valid strategy provided. Strategy was: {self.strategy}, valid strategies are: {available_strategies}"
                )
        else:
            # Pandas syntax
            func = None
            if self.strategy == "forward":
                func = pd.core.groupby.SeriesGroupBy.ffill
            elif self.strategy == "backward":
                func = pd.core.groupby.SeriesGroupBy.bfill
            elif self.strategy == "zero":
                self.value = 0
            elif self.value is None:
                raise ValueError(f"No valid strategy provided. Strategy was: {self.strategy}")

            if len(groups) > 0:
                df = new_data.groupby(groups)
            else:
                df = new_data.get_df()
            # [self.columns] = data.groupby(groups)[self.columns].fillna(self.value, method=self.strategy, limit=self.limit)
            new_df = new_data.get_df()
            if self.value is not None:
                # If value is set, fill with value
                if isinstance(df, GroupBy) or isinstance(df, DataFrameGroupBy):
                    # This type of imputation can only be done with ungrouped data
                    df = df.obj
                updated_columns = {col: df[col].fillna(self.value) for col in self.columns}
            else:
                # if func is None:
                #     # updated_columns = {col: df[col].fillna(self.value, method=self.strategy, limit=self.limit) for col in
                #     #                    self.columns}
                # else:
                updated_columns = {col: func(df[col]) for col in self.columns}

            # Use pd.concat to update the DataFrame in one go
            new_df = pd.concat([new_df.drop(columns=self.columns), pd.DataFrame(updated_columns)], axis=1)
            df = new_df
            new_data.set_df(df)
        return new_data
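
For example (a sketch on the polars backend, where the valid strategies are polars' fill_null strategies): forward-fill within each group with a limit, then fill whatever is still missing with a constant:

from recipies.step import StepImputeFill

rec.add_step(StepImputeFill(strategy="forward", limit=2))  # per-group ffill, at most 2 steps
rec.add_step(StepImputeFill(value=0))                      # constant fill for the remainder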

StepImputeModel

Bases: Step

Uses a pretrained imputation model to impute missing values.

Parameters:

- model: A function that takes a dataframe and the grouping columns as input and returns a dataframe with imputed values without the grouping column.

Source code in recipies/step.py
class StepImputeModel(Step):
    """Uses a pretrained imputation model to impute missing values.
    Args:
        model: A function that takes a dataframe and the grouping columns as input and
            returns a dataframe with imputed values without the grouping column.
    """

    def __init__(self, sel=all_predictors(), model=None):
        super().__init__(sel)
        self.desc = "Impute with pretrained imputation model"
        self.model = model

    def transform(self, data):
        new_data = self._check_ingredients(data)
        if data.get_backend() == Backend.POLARS:
            new_data[self.columns] = self.model(new_data[self.columns + select_groups(new_data)], select_groups(new_data))
        return new_data

StepResampling

Bases: Step

Source code in recipies/step.py
class StepResampling(Step):
    def __init__(
        self,
        new_resolution: str = "1h",
        accumulator_dict: Dict[Selector, Accumulator] = None,
        default_accumulator: Accumulator = Accumulator.LAST,
    ):
        """This class represents a resampling step in a recipe.

        Args:
            new_resolution: Resolution to resample to.
            accumulator_dict: Supply dictionary with individual accumulation methods for each Selector.
            default_accumulator: Accumulator to use for variables not supplied in dictionary.
        """
        super().__init__()
        self.new_resolution = new_resolution
        self.acc_dict = accumulator_dict or {all_predictors(): Accumulator.LAST}
        self.default_accumulator = default_accumulator
        self._group = True

    def do_fit(self, data: Ingredients):
        self._trained = True

    def transform(self, data):
        new_data = self._check_ingredients(data)

        # Check for and save first sequence role
        if select_sequence(new_data) is not None:
            sequence_role = select_sequence(new_data)[0]
        else:
            raise AssertionError("Sequence role has not been assigned, resampling step not possible")
        sequence_datatype = new_data.dtypes[sequence_role]

        if data.get_backend() == Backend.POLARS and not sequence_datatype.is_temporal():
            raise ValueError(f"Expected Timedelta or Timestamp object, got {sequence_datatype}")
        if data.get_backend() == Backend.PANDAS and not (
            is_timedelta64_dtype(sequence_datatype) or is_datetime64_any_dtype(sequence_datatype)
        ):
            raise ValueError(f"Expected Timedelta or Timestamp object, got {sequence_datatype}")

        # Dictionary with the format column: str , accumulator:str is created
        col_acc_map = {}
        # Go through supplied Selector, Accumulator pairs
        for selector, accumulator in self.acc_dict.items():
            selected_columns = selector(new_data)
            # Add variables associated with selector with supplied accumulator
            col_acc_map.update({col: accumulator.value for col in selected_columns})

        # Add non-specified variables, if not a sequence role
        col_acc_map.update(
            {
                col: self.default_accumulator.value
                for col in set(new_data.columns).difference(col_acc_map.keys())
                if col not in select_sequence(new_data)
            }
        )
        # acc_col_map = dict((v, k) for k, v in col_acc_map.items())
        if data.get_backend() == Backend.POLARS:
            from collections import defaultdict

            acc_col_map = defaultdict(list)
            for k, v in col_acc_map.items():
                acc_col_map[v].append(k)
            if len(select_groups(new_data)) > 0:
                grouping_role = select_groups(new_data)[0]
                # Resampling with the functions defined in col_acc_map
                new_data.set_df(new_data.get_df().sort(grouping_role, sequence_role).set_sorted(sequence_role))
                new_data.set_df(
                    new_data.get_df()
                    .upsample(every=self.new_resolution, time_column=sequence_role, group_by=grouping_role)
                    .with_columns(pl.col(acc_col_map["last"]).fill_null(strategy="forward"))
                    .with_columns(pl.col(acc_col_map["mean"]).fill_null(strategy="mean"))
                    .with_columns(pl.col(acc_col_map["max"]).fill_null(strategy="max"))
                    .with_columns(pl.col(grouping_role).fill_null(strategy="forward"))
                )
            else:
                new_data.set_df(new_data.get_df().sort(sequence_role).set_sorted(sequence_role))
                new_data.set_df(
                    new_data.get_df()
                    .upsample(every=self.new_resolution, time_column=sequence_role)
                    .with_columns(pl.col(acc_col_map["last"]).fill_null(strategy="forward"))
                    .with_columns(pl.col(acc_col_map["mean"]).fill_null(strategy="mean"))
                    .with_columns(pl.col(acc_col_map["max"]).fill_null(strategy="max"))
                )
        else:
            # Resampling with the functions defined in col_acc_map
            if len(select_groups(new_data)) > 0:
                df = data.groupby(select_groups(data))
            else:
                df = data.get_df()
            new_data.set_df(df.resample(self.new_resolution, on=sequence_role).agg(col_acc_map))

            # Remove multi-index in case of grouped data
            if isinstance(data.get_df(), DataFrameGroupBy):
                new_data = new_data.set_df(new_data.get_df().droplevel(select_groups(data.get_df().obj)))

            # Remove sequence index, while keeping column
            # new_data = new_data.set_df(new_data.get_df().reset_index(drop=False))
        return new_data
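
Usage sketch, continuing the earlier recipe (which has a datetime-typed sequence column): resample to two-hour windows, averaging the numeric predictors and carrying the last value of everything else. The selector and Accumulator import paths are assumptions:

from recipies.selector import all_numeric_predictors  # assumed module path
from recipies.step import Accumulator, StepResampling

rec.add_step(
    StepResampling(
        new_resolution="2h",
        accumulator_dict={all_numeric_predictors(): Accumulator.MEAN},
        default_accumulator=Accumulator.LAST,
    )
)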

__init__(new_resolution='1h', accumulator_dict=None, default_accumulator=Accumulator.LAST)

This class represents a resampling step in a recipe.

Parameters:

- new_resolution (str, default '1h'): Resolution to resample to.
- accumulator_dict (Dict[Selector, Accumulator], default None): Supply dictionary with individual accumulation methods for each Selector.
- default_accumulator (Accumulator, default LAST): Accumulator to use for variables not supplied in dictionary.
Source code in recipies/step.py
def __init__(
    self,
    new_resolution: str = "1h",
    accumulator_dict: Dict[Selector, Accumulator] = None,
    default_accumulator: Accumulator = Accumulator.LAST,
):
    """This class represents a resampling step in a recipe.

    Args:
        new_resolution: Resolution to resample to.
        accumulator_dict: Supply dictionary with individual accumulation methods for each Selector.
        default_accumulator: Accumulator to use for variables not supplied in dictionary.
    """
    super().__init__()
    self.new_resolution = new_resolution
    self.acc_dict = accumulator_dict or {all_predictors(): Accumulator.LAST}
    self.default_accumulator = default_accumulator
    self._group = True

StepScale

Bases: StepSklearn

Provides a wrapper for scaling with StepSklearn. Note that because scikit-learn transforms None (null) values to NaN, the step reverts this conversion after transforming.

Parameters:

- with_mean (bool, default True): If True, center the data before scaling.
- with_std (bool, default True): If True, scale the data to unit variance (or equivalently, unit standard deviation).
- in_place (bool, default True): Set to False to have the step generate new columns instead of overwriting the existing ones.
Source code in recipies/step.py
class StepScale(StepSklearn):
    """Provides a wrapper for a scaling with StepSklearn.
    Note that because SKlearn transforms None (nulls) to NaN, we have to revert.

    Args:
       with_mean: Defaults to True. If True, center the data before scaling.
       with_std: Defaults to True. If True, scale the data to unit variance (or equivalently, unit standard deviation).
       in_place: Defaults to True. Set to False to have the step generate new columns instead of overwriting the existing ones.
    """

    def __init__(
        self,
        sel=all_numeric_predictors(),
        with_mean: bool = True,
        with_std: bool = True,
        in_place: bool = True,
        *args,
        **kwargs,
    ):
        super().__init__(
            sklearn_transformer=StandardScaler(with_mean=with_mean, with_std=with_std),
            sel=sel,
            in_place=in_place,
            *args,
            **kwargs,
        )
        self.desc = "Scale with StandardScaler"

    def transform(self, data: Ingredients) -> Ingredients:
        data = super().transform(data)
        # Revert the null -> NaN conversion performed by sklearn (Polars backend only)
        if data.get_backend() == Backend.POLARS:
            data.set_df(data.get_df().with_columns(pl.col(self.columns).fill_nan(None)))
        return data
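
A sketch of how StepScale might be wired into a recipe; the Recipe import path and constructor arguments are assumptions, only StepScale's own arguments are taken from this page.

from recipies.recipe import Recipe  # import path assumed
from recipies.step import StepScale

# Standardize numeric predictors in place; nulls survive the round trip
# because transform() converts sklearn's NaNs back to nulls on the Polars backend.
rec = Recipe(df, outcomes=["y"], predictors=["hr", "sbp"])  # df and role arguments assumed
rec.add_step(StepScale(with_mean=True, with_std=True))
scaled = rec.prep()  # fit and apply on the training data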

StepSklearn

Bases: Step

This step takes a transformer from scikit-learn and makes it usable as a step in a recipe.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| sklearn_transformer | object | Instance of a scikit-learn transformer that implements fit() and transform(). | required |
| columnwise | bool | Set to True to fit and transform the DataFrame column by column. | False |
| in_place | bool | Set to False to have the step generate new columns instead of overwriting the existing ones. | True |
| role | str | In case new columns are added, set their role to this value. | 'predictor' |
Source code in recipies/step.py (lines 399–507)
class StepSklearn(Step):
    """This step takes a transformer from scikit-learn and makes it usable as a step in a recipe.

    Args:
        sklearn_transformer: Instance of scikit-learn transformer that implements fit() and transform().
        columnwise: Defaults to False. Set to True to fit and transform the DF column by column.
        in_place: Defaults to True. Set to False to have the step generate new columns
            instead of overwriting the existing ones.
        role (str, optional): Defaults to 'predictor'. In case new columns are added, set their role to role.
    """

    def __init__(
        self,
        sklearn_transformer: object,
        sel: Selector = all_predictors(),
        columnwise: bool = False,
        in_place: bool = True,
        role: str = "predictor",
    ):
        super().__init__(sel)
        self.desc = f"Use sklearn transformer {sklearn_transformer.__class__.__name__}"
        self.sklearn_transformer = sklearn_transformer
        self.columnwise = columnwise
        self.in_place = in_place
        self.role = role
        self._group = False

    def do_fit(self, data: Ingredients) -> Ingredients:
        """
        Raises:
            ValueError: If the transformer expects a single column but gets multiple.
        """
        if self.columnwise:
            self._transformers = {
                # copy the transformer so we keep the distinct fit for each column and don't just refit
                col: deepcopy(self.sklearn_transformer.fit(data[col]))
                for col in self.columns
            }
        else:
            try:
                self.sklearn_transformer.fit(data[self.columns])
            except ValueError as e:
                if "should be a 1d array" in str(e) or "Multioutput target data is not supported" in str(e):
                    raise ValueError(
                        "The sklearn transformer expects a 1d array as input. Try running the step with columnwise=True."
                    )
                raise

    def transform(self, data: Ingredients) -> Ingredients:
        """
        Raises:
            TypeError: If the transformer returns a sparse matrix.
            ValueError: If the transformer returns an unexpected number of columns.
        """
        new_data = self._check_ingredients(data)

        if self.columnwise:
            for col in self.columns:
                new_cols = self._transformers[col].transform(new_data[col])
                if self.in_place and new_cols.ndim == 2 and new_cols.shape[1] > 1:
                    raise ValueError(
                        "The sklearn transformer returned more than one column. Try running the step with in_place=False."
                    )
                col_names = (
                    col
                    if self.in_place
                    else [f"{self.sklearn_transformer.__class__.__name__}_{col}_{i + 1}" for i in range(new_cols.shape[1])]
                )
                if data.get_backend() == Backend.POLARS:
                    if isinstance(col_names, str):
                        col_names = [col_names]
                    updated_cols = pl.from_numpy(new_cols, schema=col_names)
                    new_data.data = new_data.data.with_columns(updated_cols)
                else:
                    df = new_data.get_df()
                    df[col_names] = new_cols
                    new_data.set_df(df)
        else:
            new_cols = self.sklearn_transformer.transform(new_data[self.columns])
            if isspmatrix(new_cols):
                raise TypeError(
                    "The sklearn transformer returns a sparse matrix, "
                    "but recipes expects a dense numpy representation. "
                    "Try setting sparse_output=False or similar in the transformer initialization."
                )

            col_names = (
                self.columns
                if self.in_place
                else (
                    [f"{self.sklearn_transformer.__class__.__name__}_{self.columns[i]}" for i in range(new_cols.shape[1])]
                    if new_cols.shape[1] == len(self.columns)
                    else [f"{self.sklearn_transformer.__class__.__name__}_{i + 1}" for i in range(new_cols.shape[1])]
                )
            )
            if new_cols.shape[1] != len(col_names):
                raise ValueError(
                    "The sklearn transformer returned a different amount of columns. Try running the step with in_place=False."
                )

            new_data[col_names] = new_cols

        # set role of new columns
        if not self.in_place:
            for col in col_names:
                new_data.update_role(col, self.role)

        return new_data
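
As a quick sketch, wrapping an off-the-shelf scikit-learn transformer (the selector helper is the one documented further below):

from sklearn.preprocessing import MinMaxScaler

from recipies.selector import all_numeric_predictors
from recipies.step import StepSklearn

# Rescale every numeric predictor to [0, 1], overwriting the original columns.
step = StepSklearn(MinMaxScaler(), sel=all_numeric_predictors(), in_place=True)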

do_fit(data)

Raises:

| Type | Description |
| --- | --- |
| ValueError | If the transformer expects a single column but gets multiple. |

Source code in recipies/step.py (lines 426–446)
def do_fit(self, data: Ingredients) -> Ingredients:
    """
    Raises:
        ValueError: If the transformer expects a single column but gets multiple.
    """
    if self.columnwise:
        self._transformers = {
            # copy the transformer so we keep the distinct fit for each column and don't just refit
            col: deepcopy(self.sklearn_transformer.fit(data[col]))
            for col in self.columns
        }
    else:
        try:
            self.sklearn_transformer.fit(data[self.columns])
        except ValueError as e:
            if "should be a 1d array" in str(e) or "Multioutput target data is not supported" in str(e):
                raise ValueError(
                    "The sklearn transformer expects a 1d array as input. Try running the step with columnwise=True."
                )
            raise

transform(data)

Raises:

| Type | Description |
| --- | --- |
| TypeError | If the transformer returns a sparse matrix. |
| ValueError | If the transformer returns an unexpected number of columns. |

Source code in recipies/step.py (lines 448–507)
def transform(self, data: Ingredients) -> Ingredients:
    """
    Raises:
        TypeError: If the transformer returns a sparse matrix.
        ValueError: If the transformer returns an unexpected number of columns.
    """
    new_data = self._check_ingredients(data)

    if self.columnwise:
        for col in self.columns:
            new_cols = self._transformers[col].transform(new_data[col])
            if self.in_place and new_cols.ndim == 2 and new_cols.shape[1] > 1:
                raise ValueError(
                    "The sklearn transformer returned more than one column. Try running the step with in_place=False."
                )
            col_names = (
                col
                if self.in_place
                else [f"{self.sklearn_transformer.__class__.__name__}_{col}_{i + 1}" for i in range(new_cols.shape[1])]
            )
            if data.get_backend() == Backend.POLARS:
                if isinstance(col_names, str):
                    col_names = [col_names]
                updated_cols = pl.from_numpy(new_cols, schema=col_names)
                new_data.data = new_data.data.with_columns(updated_cols)
            else:
                df = new_data.get_df()
                df[col_names] = new_cols
                new_data.set_df(df)
    else:
        new_cols = self.sklearn_transformer.transform(new_data[self.columns])
        if isspmatrix(new_cols):
            raise TypeError(
                "The sklearn transformer returns a sparse matrix, "
                "but recipes expects a dense numpy representation. "
                "Try setting sparse_output=False or similar in the transformer initialization."
            )

        col_names = (
            self.columns
            if self.in_place
            else (
                [f"{self.sklearn_transformer.__class__.__name__}_{self.columns[i]}" for i in range(new_cols.shape[1])]
                if new_cols.shape[1] == len(self.columns)
                else [f"{self.sklearn_transformer.__class__.__name__}_{i + 1}" for i in range(new_cols.shape[1])]
            )
        )
        if new_cols.shape[1] != len(col_names):
            raise ValueError(
                "The sklearn transformer returned a different amount of columns. Try running the step with in_place=False."
            )

        new_data[col_names] = new_cols

    # set role of new columns
    if not self.in_place:
        for col in col_names:
            new_data.update_role(col, self.role)

    return new_data
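
The two escape hatches named in the error messages above can be combined in a short sketch (the column names are hypothetical):

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from recipies.selector import all_of
from recipies.step import StepSklearn

# LabelEncoder expects a 1d array, so fit it column by column.
encode = StepSklearn(LabelEncoder(), sel=all_of(["ward"]), columnwise=True)

# OneHotEncoder produces several columns per input, so generate new columns
# (named OneHotEncoder_1, ... per the scheme above) instead of writing in place;
# sparse_output=False avoids the sparse-matrix TypeError.
onehot = StepSklearn(OneHotEncoder(sparse_output=False), sel=all_of(["unit"]), in_place=False)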

Selector

Class responsible for selecting the variables affected by a recipe step. Once called with an Ingredients object, it can also be iterated, indexed, and passed to len().

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| description | str | Text used to represent the Selector when printed in summaries. | required |
| names | Union[str, list[str]] | Column names to select. | None |
| roles | Union[str, list[str]] | Column roles to select, see also Ingredients. | None |
| types | Union[str, list[str]] | Column data types to select. | None |
| pattern | Pattern | Regex pattern to search column names with. | None |
Source code in recipies/selector.py (lines 9–126)
class Selector:
    """Class responsible for selecting the variables affected by a recipe step.
    Once called with an Ingredients object, it can also be iterated, indexed, and passed to len().

    Args:
        description: Text used to represent Selector when printed in summaries
        names: Column names to select. Defaults to None.
        roles: Column roles to select, see also Ingredients. Defaults to None.
        types: Column data types to select. Defaults to None.
        pattern: Regex pattern to search column names with. Defaults to None.
    """

    def __init__(
        self,
        description: str,
        names: Union[str, list[str]] = None,
        roles: Union[str, list[str]] = None,
        types: Union[str, list[str]] = None,
        pattern: re.Pattern = None,
    ):
        self.description = description
        self.set_names(names)
        self.set_roles(roles)
        self.set_types(types)
        self.set_pattern(pattern)

    def __iter__(self):
        """Allow Selector to be used as an iterable after being called with Ingredients."""
        if not hasattr(self, "_last_selection"):
            raise AttributeError("Selector must be called with Ingredients before iteration.")
        return iter(self._last_selection)

    def __len__(self):
        """Return the number of selected columns after being called."""
        if not hasattr(self, "_last_selection"):
            raise AttributeError("Selector must be called with Ingredients before getting length.")
        return len(self._last_selection)

    def __getitem__(self, idx):
        """Allow indexing into the selected columns after being called."""
        if not hasattr(self, "_last_selection"):
            raise AttributeError("Selector must be called with Ingredients before indexing.")
        return self._last_selection[idx]

    def set_names(self, names: Union[str, list[str]]):
        """Set the column names to select with this Selector

        Args:
            names: column names to select
        """
        self.names = enlist_str(names)

    def set_roles(self, roles: Union[str, list[str]]):
        """Set the column roles to select with this Selector

        Args:
            roles: column roles to select, see also Ingredients
        """
        self.roles = enlist_str(roles)

    def set_types(self, roles: Union[str, list[str]]):
        """Set the column data types to select with this Selector

        Args:
            roles: column data types to select
        """
        self.types = enlist_str(roles)

    def set_pattern(self, pattern: re.Pattern):
        """Set the pattern to search with this Selector

        Args:
            pattern: Regex pattern to search column names with.
        """
        self.pattern = pattern

    def __call__(self, ingr: Ingredients) -> list[str]:
        """Select variables from Ingredients

        Args:
            ingr: object from which to select the variables

        Raises:
            TypeError: when something other than an Ingredients object is passed

        Returns:
            Selected variables.
        """

        if not isinstance(ingr, Ingredients):
            raise TypeError(f"Expected Ingredients, got {ingr.__class__}")

        vars = list(ingr.columns)

        if self.roles is not None:
            sel_roles = [v for v, r in ingr.roles.items() if intersection(r, self.roles)]
            vars = intersection(vars, sel_roles)

        if self.types is not None:
            sel_types = list(ingr.select_dtypes(include=self.types))
            vars = intersection(vars, sel_types)

        if self.names is not None:
            vars = intersection(vars, self.names)

        if self.pattern is not None:
            vars = list(filter(self.pattern.search, vars))

        self._last_selection = vars  # store the selection for __iter__/__len__/__getitem__
        return vars

    def __repr__(self):
        return self.description
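
A small end-to-end sketch of direct Selector use; the example data and the exact shape of the roles mapping are assumptions:

import re

import polars as pl

from recipies.ingredients import Ingredients
from recipies.selector import Selector

# Roles are given per column as (lists of) strings.
ingr = Ingredients(
    pl.DataFrame({"vital_hr": [60, 72], "vital_sbp": [120, 115], "y": [0, 1]}),
    roles={"vital_hr": ["predictor"], "vital_sbp": ["predictor"], "y": ["outcome"]},
)

sel = Selector(description="vital predictors", roles="predictor", pattern=re.compile("^vital_"))
cols = sel(ingr)  # evaluates the rule -> ["vital_hr", "vital_sbp"]
n = len(sel)      # iteration, len(), and indexing work only after the call above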

__call__(ingr)

Select variables from Ingredients

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| ingr | Ingredients | Object from which to select the variables. | required |

Raises:

| Type | Description |
| --- | --- |
| TypeError | When something other than an Ingredients object is passed. |

Returns:

| Type | Description |
| --- | --- |
| list[str] | Selected variables. |

Source code in recipies/selector.py (lines 86–123)
def __call__(self, ingr: Ingredients) -> list[str]:
    """Select variables from Ingredients

    Args:
        ingr: object from which to select the variables

    Raises:
        TypeError: when something other than an Ingredients object is passed

    Returns:
        Selected variables.
    """

    if not isinstance(ingr, Ingredients):
        raise TypeError(f"Expected Ingredients, got {ingr.__class__}")

    vars = list(ingr.columns)

    if self.roles is not None:
        sel_roles = [v for v, r in ingr.roles.items() if intersection(r, self.roles)]
        vars = intersection(vars, sel_roles)

    if self.types is not None:
        sel_types = list(ingr.select_dtypes(include=self.types))
        vars = intersection(vars, sel_types)

    if self.names is not None:
        vars = intersection(vars, self.names)

    if self.pattern is not None:
        vars = list(filter(self.pattern.search, vars))

    self._last_selection = vars  # store the selection for __iter__/__len__/__getitem__
    return vars

__getitem__(idx)

Allow indexing into the selected columns after being called.

Source code in recipies/selector.py (lines 47–51)
def __getitem__(self, idx):
    """Allow indexing into the selected columns after being called."""
    if not hasattr(self, "_last_selection"):
        raise AttributeError("Selector must be called with Ingredients before indexing.")
    return self._last_selection[idx]

__iter__()

Allow Selector to be used as an iterable after being called with Ingredients.

Source code in recipies/selector.py (lines 35–39)
def __iter__(self):
    """Allow Selector to be used as an iterable after being called with Ingredients."""
    if not hasattr(self, "_last_selection"):
        raise AttributeError("Selector must be called with Ingredients before iteration.")
    return iter(self._last_selection)

__len__()

Return the number of selected columns after being called.

Source code in recipies/selector.py (lines 41–45)
def __len__(self):
    """Return the number of selected columns after being called."""
    if not hasattr(self, "_last_selection"):
        raise AttributeError("Selector must be called with Ingredients before getting length.")
    return len(self._last_selection)

set_names(names)

Set the column names to select with this Selector

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| names | Union[str, list[str]] | Column names to select. | required |
Source code in recipies/selector.py (lines 53–59)
def set_names(self, names: Union[str, list[str]]):
    """Set the column names to select with this Selector

    Args:
        names: column names to select
    """
    self.names = enlist_str(names)

set_pattern(pattern)

Set the pattern to search with this Selector

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pattern | Pattern | Regex pattern to search column names with. | required |
Source code in recipies/selector.py (lines 78–84)
def set_pattern(self, pattern: re.Pattern):
    """Set the pattern to search with this Selector

    Args:
        pattern: Regex pattern to search column names with.
    """
    self.pattern = pattern

set_roles(roles)

Set the column roles to select with this Selector

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| roles | Union[str, list[str]] | Column roles to select, see also Ingredients. | required |
Source code in recipies/selector.py (lines 61–67)
def set_roles(self, roles: Union[str, list[str]]):
    """Set the column roles to select with this Selector

    Args:
        roles: column roles to select, see also Ingredients
    """
    self.roles = enlist_str(roles)

set_types(roles)

Set the column data types to select with this Selector

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| roles | Union[str, list[str]] | Column data types to select. | required |
Source code in recipies/selector.py (lines 69–75)
def set_types(self, roles: Union[str, list[str]]):
    """Set the column data types to select with this Selector

    Args:
        roles: column data types to select
    """
    self.types = enlist_str(roles)

all_groups()

Define selector for all grouping variables

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 334–340)
def all_groups() -> Selector:
    """Define selector for all grouping variables

    Returns:
        Object representing the selection rule.
    """
    return Selector(description="all grouping variables", roles=["group"])

all_numeric_predictors(backend=Backend.POLARS)

Define selector for all numerical predictor columns

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 306–320)
def all_numeric_predictors(backend=Backend.POLARS) -> Selector:
    """Define selector for all numerical predictor columns

    Returns:
        Object representing the selection rule.
    """
    sel = all_predictors()
    sel.set_types(
        ["Int8", "Int16", "Int32", "Int64", "Float32", "Float64", "int16", "int32", "int64", "float16", "float32", "float64"]
    )
    sel.description = "all numeric predictors"
    return sel

all_of(names)

Define selector for any columns with one of the given names

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| names | Union[str, list[str]] | Names to select. | required |

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 207–216)
def all_of(names: Union[str, list[str]]) -> Selector:
    """Define selector for any columns with one of the given names

    Args:
        names: names to select

    Returns:
        Object representing the selection rule.
    """
    return Selector(description=str(names), names=names)

all_outcomes()

Define selector for all outcome columns

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 323–331)
def all_outcomes() -> Selector:
    """Define selector for all outcome columns

    Returns:
        Object representing the selection rule.
    """
    sel = has_role(["outcome"])
    sel.description = "all outcomes"
    return sel

all_predictors()

Define selector for all predictor columns

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 295–303)
def all_predictors() -> Selector:
    """Define selector for all predictor columns

    Returns:
        Object representing the selection rule.
    """
    sel = has_role(["predictor"])
    sel.description = "all predictors"
    return sel

all_sequences()

Define selector for all sequence variables

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 355–361)
def all_sequences() -> Selector:
    """Define selector for all grouping variables

    Returns:
        Object representing the selection rule.
    """
    return Selector(description="all sequence variables", roles=["sequence"])

contains(substring)

Define selector for any columns where the name contains the substring

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| substring | str | Substring to search for. | required |

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 256–265)
def contains(substring: str) -> Selector:
    """Define selector for any columns where the name contains the substring

    Args:
        substring: substring to search for

    Returns:
        Object representing the selection rule.
    """
    return regex_names(f"{substring}")

ends_with(suffix)

Define selector for any columns where the name ends with the suffix

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| suffix | str | Suffix to search for. | required |

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 244–253)
def ends_with(suffix: str) -> Selector:
    """Define selector for any columns where the name ends with the suffix

    Args:
        suffix: suffix to search for

    Returns:
        Object representing the selection rule.
    """
    return regex_names(f"{suffix}$")

enlist_dt(x)

Wrap a pl datatype in a list if it isn't a list yet

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| x | Union[DataType, list[DataType], None] | Object to wrap. | required |

Raises:

| Type | Description |
| --- | --- |
| TypeError | If neither a datatype nor a list of datatypes is passed. |

Returns:

| Type | Description |
| --- | --- |
| Union[list[DataType], None] | The datatype(s) wrapped in a list, or None if x is None. |

Source code in recipies/selector.py (lines 129–159)
def enlist_dt(x: Union[DataType, list[DataType], None]) -> Union[list[DataType], None]:
    """Wrap a pl datatype in a list if it isn't a list yet

    Args:
        x: object to wrap.

    Raises:
        TypeError: If neither a datatype nor a list of datatypes is passed

    Returns:
        The datatype(s) wrapped in a list, or None if x is None.
    """
    if (
        isinstance(x, DataType)
        or (isinstance(x, type) and issubclass(x, DataType))
        or isinstance(x, pl.datatypes.DataTypeClass)
    ):
        return [x]
    elif isinstance(x, list):
        if not all(
            isinstance(x, DataType)
            or (isinstance(x, type) and issubclass(x, DataType))
            or isinstance(x, pl.datatypes.DataTypeClass)
            for x in x
        ):
            raise TypeError("Only lists of datatypes are allowed.")
        return x
    elif x is None:
        return x
    else:
        raise TypeError(f"Expected a pl datatype, got {x.__class__}")

enlist_str(x)

Wrap a str in a list if it isn't a list yet

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| x | Union[str, list[str], None] | Object to wrap. | required |

Raises:

| Type | Description |
| --- | --- |
| TypeError | If neither a str nor a list of strings is passed. |

Returns:

| Type | Description |
| --- | --- |
| Union[list[str], None] | The string(s) wrapped in a list, or None if x is None. |

Source code in recipies/selector.py (lines 162–183)
def enlist_str(x: Union[str, list[str], None]) -> Union[list[str], None]:
    """Wrap a str in a list if it isn't a list yet

    Args:
        x: object to wrap.

    Raises:
        TypeError: If neither a str nor a list of strings is passed

    Returns:
        The string(s) wrapped in a list, or None if x is None.
    """
    if isinstance(x, str):
        return [x]
    elif isinstance(x, list):
        if not all(isinstance(i, str) for i in x):
            raise TypeError("Only lists of str are allowed.")
        return x
    elif x is None:
        return x
    else:
        raise TypeError(f"Expected str or list of str, got {x.__class__}")

has_role(roles)

Define selector for any columns with one of the given roles

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| roles | Union[str, list[str]] | Roles to select. | required |

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 268–277)
def has_role(roles: Union[str, list[str]]) -> Selector:
    """Define selector for any columns with one of the given roles

    Args:
        roles: roles to select

    Returns:
       Object representing the selection rule.
    """
    return Selector(description=f"roles: {roles}", roles=roles)

has_type(types)

Define selector for any columns with one of the given types

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| types | Union[str, list[str]] | Data types to select. | required |

Note:

Data types are selected based on their string representation as returned by df[varname].dtype.name.

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 280–292)
def has_type(types: Union[str, list[str]]) -> Selector:
    """Define selector for any columns with one of the given types

    Args:
        types: data types to select

    Note:
        Data types are selected based on their string representation as returned by `df[varname].dtype.name`.

    Returns:
        Object representing the selection rule.
    """
    return Selector(description=f"types: {types}", types=types)

intersection(x, y)

Intersection of two lists

Note:

Maintains the order of the first list; does not deduplicate items (i.e., does not return a set).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| x | list | First list. | required |
| y | list | Second list. | required |

Returns:

| Type | Description |
| --- | --- |
| list | Elements in x that are also in y. |

Source code in recipies/selector.py (lines 186–204)
def intersection(x: list, y: list) -> list:
    """Intersection of two lists

    Note:
        maintains the order of the first list
        does not deduplicate items (i.e., does not return a set)

    Args:
        x: first list
        y: second list

    Returns:
        Elements in `x` that are also in `y`.
    """
    if isinstance(x, str):
        x = [x]
    if isinstance(y, str):
        y = [y]
    return [i for i in x if i in y]
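
A short example illustrating both properties from the note:

from recipies.selector import intersection

intersection(["b", "a", "a"], ["a", "b"])  # -> ["b", "a", "a"]: first-list order kept, duplicates kept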

regex_names(regex)

Define selector for any columns where the name matches the regex pattern

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| regex | str | String to be transformed into the regex pattern to search for. | required |

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 219–229)
def regex_names(regex: str) -> Selector:
    """Define selector for any columns where the name matches the regex pattern

    Args:
        regex: string to be transformed to regex pattern to search for

    Returns:
        Object representing the selection rule.
    """
    pattern = re.compile(regex)
    return Selector(description=f"regex: {regex}", pattern=pattern)

select_groups(ingr)

Select any grouping columns

Defines and directly applies Selector(roles=["group"])

Returns:

| Type | Description |
| --- | --- |
| list[str] | Grouping columns. |

Source code in recipies/selector.py (lines 343–352)
def select_groups(ingr: Ingredients) -> list[str]:
    """Select any grouping columns

    Defines and directly applies Selector(roles=["group"])

    Returns:
        grouping columns
    """
    groups = all_groups()(ingr)
    return groups

select_sequence(ingr)

Select any sequence columns

Defines and directly applies Selector(roles=["sequence"])

Returns:

| Type | Description |
| --- | --- |
| list[str] | Sequence columns. |

Source code in recipies/selector.py (lines 364–372)
def select_sequence(ingr: Ingredients) -> list[str]:
    """Select any sequence columns

    Defines and directly applies Selector(roles=["sequence"])

    Returns:
        Sequence columns.
    """
    return all_sequences()(ingr)

starts_with(prefix)

Define selector for any columns where the name starts with the prefix

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| prefix | str | Prefix to search for. | required |

Returns:

| Type | Description |
| --- | --- |
| Selector | Object representing the selection rule. |

Source code in recipies/selector.py (lines 232–241)
def starts_with(prefix: str) -> Selector:
    """Define selector for any columns where the name starts with the prefix

    Args:
        prefix: prefix to search for

    Returns:
        Object representing the selection rule.
    """
    return regex_names(f"^{prefix}")