python - Transforming targets with inputs in a scikit-learn pipeline -


I have written a custom transformer object that introduces a time-delayed embedding of a signal. I want to include its hyper-parameters in a GridSearchCV. The trouble is that it requires a post-processing step to drop the rows with null values introduced by the pandas shift operation. As I understand it, transformers are not designed to process targets, although I have found a way to make it work. My question is: is this a legitimate approach, or is there a less hacky way of doing this?

The time-delay embedding transformer is:

from typing import Union

from pandas import DataFrame, Series, concat
from sklearn.base import TransformerMixin


class TimeDelayEmbedder(TransformerMixin):
    """
    Transformer that adds a time-delay embedding to an existing time history.

    For every input column, one output column is produced per delay in the
    inclusive range ``[min_delay, max_delay]`` by shifting the column with
    pandas ``shift``. Shifting introduces null values at the start (positive
    delays) or end (negative delays) of each column; these are NOT removed
    here and must be handled by a later step.
    """

    def __init__(self, min_delay: int, max_delay: int):
        """
        Create a new TimeDelayEmbedder.

        The two bounds are normalised, so passing them in the wrong order
        still yields a valid range.

        :param min_delay: Minimum delay (in rows) to use.
        :param max_delay: Maximum delay (in rows) to use.
        """
        self._min_delay = min(min_delay, max_delay)
        self._max_delay = max(min_delay, max_delay)

    def fit(self, x=None, y=None):
        """
        No-op; this transformer learns nothing from the data.

        :param x: Ignored.
        :param y: Ignored.
        :return: ``self``.
        """
        return self

    @staticmethod
    def _column_name(column, time_step: int) -> str:
        """Build the output column name for `column` shifted by `time_step`."""
        # Tuple-wrap the label so a tuple-valued column name is formatted as
        # a single value rather than being unpacked by the % operator.
        if time_step < 0:
            # A negative shift looks into the future: label it "t+k" with the
            # magnitude of the step (formatting the raw value gave "t+-k").
            return '%s__t+%i' % (column, -time_step)
        if time_step == 0:
            return '%s__t' % (column,)
        return '%s__t-%i' % (column, time_step)

    def transform(self, x: Union[Series, DataFrame], y: Series = None):
        """
        Transform the input data `x`.

        The returned DataFrame contains null values wherever a shift ran off
        the end of the data; drop or fill them in a separate step.

        :param x: Pandas Series or DataFrame of time histories. For a
            DataFrame, every column gets its own set of delayed copies.
            A Series must have a name, which is used as the column label.
        :param y: Targets. These are unaffected.
        :return: DataFrame with one column per (input column, delay) pair,
            on the same index as `x`.
        :rtype: DataFrame
        """
        if isinstance(x, Series):
            assert x.name, 'input series must have name.'
            x = x.to_frame(name=x.name)

        series_list = []
        for column in x.columns:
            for time_step in range(self._min_delay, self._max_delay + 1):
                data = x[column].shift(time_step)
                name = self._column_name(column, time_step)
                series_list.append(Series(data=data, name=name))

        if not series_list:
            # concat() raises on an empty list; an input without columns
            # simply yields an output without columns on the same index.
            return DataFrame(index=x.index)

        return concat(series_list, axis=1)

The null-row-dropping transformer is:

from pandas import DataFrame, Series
from sklearn.base import TransformerMixin


class NullRowDropper(TransformerMixin):
    """
    Drop rows containing nulls from a data-set.

    A pass-through to pandas ``dropna()`` that also drops the matching rows
    from the targets. Because ``transform`` is not handed the targets, the
    `y` object given to ``fit`` is remembered and modified IN PLACE; the
    caller's own `y` therefore shrinks as a side effect of ``transform``.
    """

    def __init__(self, how='any'):
        """
        Create a new NullRowDropper.

        :param how: ``'any'`` to drop a row if any value is null, ``'all'``
            to drop it only if every value is null (forwarded to ``dropna``).
        """
        assert how in ('any', 'all'), "param: 'how' must either 'any' or 'all'"
        self._how = how

    def fit(self, x: DataFrame = None, y: Series = None):
        """
        Remember the targets so ``transform`` can trim them later.

        :param x: Ignored.
        :param y: Targets sharing an index with the data later passed to
            ``transform``; may be ``None``.
        :return: ``self``.
        """
        # Keep a reference (not a copy): y is not passed to transform, so its
        # index has to be modified in place from there.
        self._y = y
        return self

    def transform(self, x: DataFrame, y: Series = None):
        """
        Drop null rows according to the `how` parameter given at
        initialisation, and drop the same index labels from the stored
        targets.

        :param x: Input pandas DataFrame.
        :param y: Ignored; the targets stored by ``fit`` are used instead.
        :return: `x` without the null rows.
        :rtype: DataFrame
        """
        x_out = x.dropna(how=self._how)

        # fit() may have been given no targets, or may not have run at all;
        # in either case there is nothing to keep in sync.
        targets = getattr(self, '_y', None)
        if targets is not None:
            # Labels taken from targets.index, so drop() can never miss one.
            stale = targets.index.difference(x_out.index)
            targets.drop(stale, inplace=True)

        return x_out


Comments

Popular posts from this blog

ubuntu - PHP script to find files of certain extensions in a directory, returns populated array when run in browser, but empty array when run from terminal -

php - How can i create a user dashboard -

javascript - How to detect toggling of the fullscreen-toolbar in jQuery Mobile? -