Slicing rolling expanding windows over multiple pandas objects
This is a fast way to yield a subset of rows from multiple pandas DataFrames or Series when one needs to work on a sliding-window basis, with a predefined minimum and maximum number of rows per window. This approach is among the fastest available and is based on the .iloc
accessor of both Series and DataFrames.
def rolling_expanding_window(seq, n_min, n_max):
    """
    Emit windows of integer positions that expand, then roll, over a sequence.

    The window is pre-filled with the first ``n_min`` positions of ``seq``;
    this warm-up window itself is not emitted. Each remaining position is then
    appended in turn and the window is emitted after every append. While the
    window holds fewer than ``n_max`` positions it expands; once full, the
    oldest position is dropped on each append, so it rolls.

    Parameters
    ----------
    seq
        Any sized object. Only ``len(seq)`` is used; the elements themselves
        are never read.
    n_min
        Number of leading positions consumed as warm-up before the first
        window is emitted.
    n_max
        Maximum number of positions kept in a window, or ``None`` for an
        unbounded (purely expanding) window.

    Yields
    ------
    collections.deque
        An independent snapshot of the current window positions, in
        increasing order. Nothing is yielded when ``n_min >= len(seq)``.
    """
    total = len(seq)
    # Clamp the warm-up to the available positions so the window is never
    # padded with placeholder values when n_min exceeds len(seq).
    warmup = max(0, min(n_min, total))
    win = deque(range(warmup), maxlen=n_max)
    for position in range(warmup, total):
        win.append(position)
        # Yield a copy: handing out the live deque would make every window a
        # caller has stored change retroactively on the next append.
        yield win.copy()
def sliding_dataframes(*arrays, n_min=0, n_max=None):
    """
    Slice several pandas objects in parallel over a rolling-expanding window.

    Like scikit-learn's ``train_test_split`` but with a rolling-expanding
    window: for every window produced by ``rolling_expanding_window`` one
    positional slice of each input is yielded.

    Parameters
    ----------
    *arrays
        One or more pandas dataframes or series to slice over in parallel.
        ``None`` entries are accepted and passed through as ``None``.
    n_min
        Minimum window size, forwarded to ``rolling_expanding_window``.
    n_max
        Maximum window size, forwarded to ``rolling_expanding_window``.

    Yields
    ------
    list
        One element per input, in input order: ``a.iloc[window]`` for each
        pandas input and ``None`` for each ``None`` input.

    Raises
    ------
    ValueError
        If no input is given, if every input is ``None``, or if the pandas
        inputs do not all have the same number of rows.
    TypeError
        If an input is neither ``None`` nor a pandas dataframe or series.

    Notes
    -----
    This is a generator function, so the validation above only runs when the
    first window is requested, not when the function is called.
    """
    if not arrays:
        raise ValueError("At least one array required as input")

    # The first non-None input dictates the number of rows; checking the type
    # before touching ``.shape`` keeps the error a TypeError for bad inputs.
    reference = None
    n_samples = None
    for candidate in arrays:
        if candidate is None:
            continue
        if not isinstance(candidate, (pd.DataFrame, pd.Series)):
            raise TypeError("This method only supports pandas dataframes and series")
        if reference is None:
            reference = candidate
            n_samples = candidate.shape[0]
        elif candidate.shape[0] != n_samples:
            raise ValueError("Specify equal length dataframes or series")

    if reference is None:
        raise ValueError("At least one non-None dataframe or series required as input")

    indices = rolling_expanding_window(reference, n_min=n_min, n_max=n_max)
    for index in indices:
        yield [a.iloc[index] if a is not None else None for a in arrays]