2

I have a linear interpolation problem with nans in my data. I have instantaneous measurements that I want to resample from 6 min intervals to 5 min intervals.

df = pd.DataFrame(zip(['10:00','10:06','10:12','10:18','10:24'],
                     [1, 2, 3, 0.5, 2.5], [0, np.nan, 5, 2.5, 10]),
                 columns=['date','column_a','column_b'])
df['date'] = pd.to_datetime(df['date'], infer_datetime_format=True)
df = df.set_index('date')

print(df)

                    column_a  column_b
date                                   
2023-07-19 10:00:00       1.0       0.0
2023-07-19 10:06:00       2.0       NaN
2023-07-19 10:12:00       3.0       5.0
2023-07-19 10:18:00       0.5       2.5
2023-07-19 10:24:00       2.5      10.0

I used this code but at 10:05 there is supposed to be nan instead of value. Thanks for helping.

print(df.resample('1Min').interpolate(method='linear', limit=5).resample('5Min').asfreq())

                     column_a  column_b
date                                  
2023-07-19 10:00:00  1.000000  0.000000
2023-07-19 10:05:00  1.833333  2.083333 <--- here should be nan
2023-07-19 10:10:00  2.666667       NaN
2023-07-19 10:15:00  1.750000  3.750000
2023-07-19 10:20:00  1.166667  5.000000
pyaj
  • 545
  • 5
  • 15

2 Answers2

1

Here is another way to do it using Pandas get_indexer and concat:

dfs = []
for item in df.resample("1Min").agg(list).resample("5Min"):
    # Get slice of df around each "5Min" value
    slice = [
        df.index.get_indexer([item[1].index[0]], method=method)[0]
        for method in ("ffill", "bfill")
    ]
    tmp_df = pd.concat(
        [df.iloc[slice, :], pd.DataFrame(index=[item[1].index[0]])]
    ).sort_index()
    tmp_df = tmp_df[~tmp_df.index.duplicated(keep="first")]

    # Interpolate only if there is a value before and after "5Min" value
    if tmp_df.iloc[0].isna().any() or tmp_df.iloc[-1].isna().any():
        dfs.append(tmp_df.loc[item[1].index[0], :].to_frame().T)
        continue
    dfs.append(tmp_df.interpolate().loc[item[1].index[0], :].to_frame().T)

new_df = pd.concat(dfs)

Then:

print(new_df)
# Output

                     column_a  column_b
2023-07-23 10:00:00      1.00      0.00
2023-07-23 10:05:00       NaN       NaN
2023-07-23 10:10:00       NaN       NaN
2023-07-23 10:15:00      1.75      3.75
2023-07-23 10:20:00      1.50      6.25
Laurent
  • 12,287
  • 7
  • 21
  • 37
  • 1
    Hi Laurent, I am also thinking about interpolating for each sequence of 5 min and skipping if there is nan. But in your implementation, there is nan in the first column while the original data has values. – pyaj Jul 24 '23 at 07:18
  • But the easiest way to do this might just be to interpolate the data and then replace the nan rows with nan – pyaj Jul 24 '23 at 07:21
1

Here df_6min is df

df_5min = []

# create list of 5 minute values in an hour
list_min5 = list(range(0, 60, 5))

# for each column get pandas data series of 5 min interval resampled with linear interpolation
for column in df_6min.columns:
    ds_5min = df_6min[column].resample('1Min').interpolate(method='linear').resample('5Min').asfreq()

    # for each index of 6 min interval check if there is nan
    for i in df_6min.index:
        if np.isnan(df_6min.loc[i, column]) == True:

            # if yes replace upper and lower index of 5 min by nan  
            for j in range(len(list_min5)):

                if list_min5[j-1] < i.minute < list_min5[j]:
                    ds_5min.loc[i.replace(minute=list_min5[j-1])] = np.nan
                    ds_5min.loc[i.replace(minute=list_min5[j])] = np.nan

                # if index is 0 minute or 30 minute, get j+1 index instead of j
                if list_min5[j-1] == i.minute:
                    ds_5min.loc[i.replace(minute=list_min5[j-1])] = np.nan
                    ds_5min.loc[i.replace(minute=list_min5[j+1])] = np.nan

    df_5min.append(ds_5min)  

df_5min = pd.concat(df_5min, axis=1)
pyaj
  • 545
  • 5
  • 15