# Source code for recipies.recipe

from __future__ import annotations
from collections import Counter
from copy import copy
from itertools import chain
from typing import Union

import pandas as pd
import polars as pl

from .ingredients import Ingredients
from .selector import select_groups
from .step import Step
from .constants import Backend


class Recipe:
    """Recipe for preprocessing data.

    A Recipe object combines a pandas-like Ingredients object with one or more
    sklearn-inspired transformation Steps to turn into a model-ready input.

    Args:
        data: data to be preprocessed. Anything that is not already an Ingredients
            object is wrapped in one.
        outcomes: names of columns in data that are assigned the 'outcome' role
        predictors: names of columns in data that should be assigned the 'predictor' role
        groups: names of columns in data that should be assigned the 'group' role
        sequences: names of columns in data that should be assigned the 'sequence' role
        backend: forwarded to the Ingredients constructor when data has to be wrapped.

    Raises:
        TypeError: if data is not an Ingredients object and cannot be wrapped in one.
    """

    columns = None
    roles = None

    def __init__(
        self,
        data: Ingredients | pl.DataFrame | pd.DataFrame,
        outcomes: Union[str, list[str], None] = None,
        predictors: Union[str, list[str], None] = None,
        groups: Union[str, list[str], None] = None,
        sequences: Union[str, list[str], None] = None,
        backend: Backend | None = None,
    ):
        if not isinstance(data, Ingredients):
            try:
                data = Ingredients(data, backend=backend)
            except Exception as e:
                # Raising a bare string is itself a TypeError in Python; raise a real
                # TypeError so the intended message and the root cause reach the caller.
                raise TypeError(f"Expected Ingredients, got {data.__class__} {e}") from e
        self.data = data
        self.steps = []
        # Snapshot of the input columns; _check_data() compares later inputs against it.
        self.original_columns = copy(data.columns)
        self.roles = self.data.roles
        self.columns = self.data.columns

        initial_roles = (
            (outcomes, "outcome"),
            (predictors, "predictor"),
            (groups, "group"),
            (sequences, "sequence"),
        )
        for names, role in initial_roles:
            if names:
                self.update_roles(names, role)

    def add_roles(self, vars: Union[str, list[str]], new_role: str = "predictor") -> Recipe:
        """Adds an additional role for one or more columns of the Recipe's Ingredients.

        Args:
            vars: The column(s) to receive additional roles.
            new_role: Defaults to 'predictor'. The role to add to the column(s).

        See also:
            Ingredients.add_role()

        Returns:
            self
        """
        if isinstance(vars, str):
            vars = [vars]
        for v in vars:
            self.data.add_role(v, new_role)
        return self

    def update_roles(
        self, vars: Union[str, list[str]], new_role: str = "predictor", old_role: str | None = None
    ) -> Recipe:
        """Adds a new role for one or more columns of the Recipe's Ingredients without roles
        or changes an existing role to a different one.

        Args:
            vars: The column(s) whose role should be set or changed.
            new_role: Defaults to 'predictor'. The role to add or change to.
            old_role: Defaults to None. The role to be changed.

        See also:
            Ingredients.update_role()

        Returns:
            self
        """
        if isinstance(vars, str):
            vars = [vars]
        for v in vars:
            self.data.update_role(v, new_role, old_role)
        return self

    def add_step(self, step: Step) -> Recipe:
        """Adds a new step to the Recipe.

        Args:
            step: a transformation step that should be applied to the Ingredients
                during prep() and bake()

        Returns:
            self
        """
        self.steps.append(step)
        return self

    def _check_data(self, data: Union[pl.DataFrame, pd.DataFrame, Ingredients, None]) -> Ingredients:
        """Resolves the data to work on and validates its columns.

        Args:
            data: None to use the recipe's own data, a polars/pandas DataFrame that is
                wrapped in Ingredients using the recipe's roles, or an object that is
                used as-is.

        Returns:
            The data to process.

        Raises:
            ValueError: if the set of columns differs from the recipe's original columns.
        """
        if data is None:
            data = self.data
        elif isinstance(data, (pl.DataFrame, pd.DataFrame)):
            # this is only executed when prep or bake receive a DF that is different to the original data
            # don't check the roles here, because self.data can have more roles than data
            # (post feature generation)
            data = Ingredients(data, roles=self.data.roles, check_roles=False)

        expected = set(self.original_columns)
        received = set(data.columns)
        if received != expected:
            # Report both directions so a missing column is not shown as an empty list.
            unexpected = [c for c in data.columns if c not in expected]
            missing = [c for c in self.original_columns if c not in received]
            raise ValueError(
                f"Columns of data argument differs from recipe data: "
                f"unexpected {unexpected}, missing {missing}."
            )
        return data

    def _apply_group(self, data, step):
        """Calls groupby on the data with its group columns when the step asks for grouping.

        Returns:
            The data object that was passed in.
        """
        if step.group:
            group_vars = select_groups(data)
            if len(group_vars) > 0:
                # NOTE(review): the return value of groupby() is discarded, so this only has
                # an effect if Ingredients.groupby() works in place - TODO confirm.
                data.groupby(group_vars)
        return data

    def prep(
        self, data: Union[pl.DataFrame, pd.DataFrame, Ingredients, None] = None, refit: bool = False
    ) -> pl.DataFrame | pd.DataFrame:
        """Fits and transforms, in other words preps, the data.

        The steps are applied to a copy (made with copy.copy) of the data.

        Args:
            data: Data to fit and transform. Defaults to None, which uses the recipe's own data.
            refit: Defaults to False. Whether to refit steps that are already trained.

        Returns:
            Transformed data.

        Raises:
            ValueError: if the columns of data differ from the recipe's original columns.
        """
        data = self._check_data(data)
        # Todo: check why the roles disappear after copying
        data = copy(data)
        data = self._apply_fit_transform(data, refit)
        return data.get_df()

    def bake(
        self, data: Union[pl.DataFrame, pd.DataFrame, Ingredients, None] = None
    ) -> pl.DataFrame | pd.DataFrame:
        """Transforms, or bakes, the data if it has been prepped.

        Steps that are not trained yet are fitted on the given data before transforming.

        Args:
            data: Data to transform. Defaults to None, which uses the recipe's own data.

        Returns:
            Transformed data.

        Raises:
            ValueError: if the columns of data differ from the recipe's original columns.
        """
        data = self._check_data(data)
        data = self._apply_fit_transform(data)
        return data.get_df()

    def _apply_fit_transform(self, data=None, refit=False):
        """Runs all steps in order on the data.

        A step is fitted and applied when refit is True or the step is not trained yet;
        otherwise it is only applied.
        """
        for step in self.steps:
            data = self._apply_group(data, step)
            if refit or not step.trained:
                data = step.fit_transform(data)
            else:
                data = step.transform(data)
        return data

    def __repr__(self):
        # Count how many variables are assigned to each existing role
        role_counts = Counter(chain.from_iterable(self.data.roles.values()))
        role_table = pl.DataFrame(
            {"role": list(role_counts.keys()), "#variables": list(role_counts.values())}
        )
        parts = ["Recipe\n\n", "Inputs:\n\n", repr(role_table), "\n\n", "Operations:\n\n"]
        # One line per step, in the order they will be applied
        parts.extend(str(step) + "\n" for step in self.steps)
        return "".join(parts)

    def get_backend(self):
        """Returns the backend reported by the recipe's Ingredients."""
        return self.data.get_backend()

    def cache(self):
        """Prepares the recipe for caching by dropping its reference to the data.

        Safe to call more than once.

        Returns:
            self
        """
        # getattr: after a previous cache() call the attribute no longer exists.
        if getattr(self, "data", None) is not None:
            del self.data
        return self