I am stuck at this point in my code. I am trying to divide the startdate and enddate into multiple rows based on months and for that I am trying to use the resample function to sample the dates on monthly basis. The sample code looks like this-
PS- A lot of the BCA_REF, STARTDATE, ENDDATE values are repeated and are not unique owing to the usecase
df = pd.DataFrame(
data = [['abc','2018-08-01','2025-07-31'], ['abc','2018-08-01','2025-07-31'],['xyz','2017-04-01','2017-04-01'], ['xyz','2017-04-01','2017-04-01'], ['pqr','2016-05-16','2017-10-15']],
columns = ['BCA_REF', 'STARTDATE', 'ENDDATE']
)
df['STARTDATE'] = pd.to_datetime(df['STARTDATE'])
df['ENDDATE'] = pd.to_datetime(df['ENDDATE'])
df_start_end = df.melt(id_vars=['BCA_REF'],value_vars=['STARTDATE','ENDDATE'], value_name='date')
df_new = (
df_start_end.groupby(['BCA_REF'])
.apply(lambda x: x.drop_duplicates('date').set_index('date')
.resample('M').pad())
.drop(columns=['BCA_REF','variable'])
.reset_index()
)
After I run this for 40K such rows, it gives me the following error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_15069/2048245652.py in <module>
4 merged_final_new = (
5 mf_start_end.groupby(['BCA_REF'])
----> 6 .apply(lambda x: x.drop_duplicates('date').set_index('date')
7 .resample('M').pad())
8 # .drop(columns=['BCA_REF','variable'])
~/.local/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
1273 with option_context("mode.chained_assignment", None):
1274 try:
-> 1275 result = self._python_apply_general(f, self._selected_obj)
1276 except TypeError:
1277 # gh-20949
~/.local/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f, data)
1307 data after applying f
1308 """
-> 1309 keys, values, mutated = self.grouper.apply(f, data, self.axis)
1310
1311 return self._wrap_applied_output(
~/.local/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
850 # group might be modified
851 group_axes = group.axes
--> 852 res = f(group)
853 if not _is_indexed_like(res, group_axes, axis):
854 mutated = True
/tmp/ipykernel_15069/2048245652.py in <lambda>(x)
5 mf_start_end.groupby(['BCA_REF'])
6 .apply(lambda x: x.drop_duplicates('date').set_index('date')
----> 7 .resample('M').pad())
8 # .drop(columns=['BCA_REF','variable'])
9 # .reset_index()
~/.local/lib/python3.7/site-packages/pandas/core/resample.py in pad(self, limit)
507 DataFrame.fillna: Fill NA/NaN values using the specified method.
508 """
--> 509 return self._upsample("pad", limit=limit)
510
511 ffill = pad
~/.local/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
1204 else:
1205 result = obj.reindex(
-> 1206 res_index, method=method, limit=limit, fill_value=fill_value
1207 )
1208
~/.local/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
322 @wraps(func)
323 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324 return func(*args, **kwargs)
325
326 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
~/.local/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
4770 kwargs.pop("axis", None)
4771 kwargs.pop("labels", None)
-> 4772 return super().reindex(**kwargs)
4773
4774 @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])
~/.local/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4817 # perform the reindex on the axes
4818 return self._reindex_axes(
-> 4819 axes, level, limit, tolerance, method, fill_value, copy
4820 ).__finalize__(self, method="reindex")
4821
~/.local/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4596 if index is not None:
4597 frame = frame._reindex_index(
-> 4598 index, method, copy, level, fill_value, limit, tolerance
4599 )
4600
~/.local/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
4612 ):
4613 new_index, indexer = self.index.reindex(
-> 4614 new_index, method=method, level=level, limit=limit, tolerance=tolerance
4615 )
4616 return self._reindex_with_indexers(
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
3824 if self._index_as_unique:
3825 indexer = self.get_indexer(
-> 3826 target, method=method, limit=limit, tolerance=tolerance
3827 )
3828 else:
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
3484 )
3485
-> 3486 return self._get_indexer(target, method, limit, tolerance)
3487
3488 def _get_indexer(
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _get_indexer(self, target, method, limit, tolerance)
3506
3507 if method in ["pad", "backfill"]:
-> 3508 indexer = self._get_fill_indexer(target, method, limit, tolerance)
3509 elif method == "nearest":
3510 indexer = self._get_nearest_indexer(target, limit, tolerance)
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _get_fill_indexer(self, target, method, limit, tolerance)
3582 indexer = engine_method(target_values, limit)
3583 else:
-> 3584 indexer = self._get_fill_indexer_searchsorted(target, method, limit)
3585 if tolerance is not None and len(self):
3586 indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance)
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _get_fill_indexer_searchsorted(self, target, method, limit)
3606 indexer = self.get_indexer(target)
3607 nonexact = indexer == -1
-> 3608 indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side)
3609 if side == "left":
3610 # searchsorted returns "indices into a sorted array such that,
~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _searchsorted_monotonic(self, label, side)
5763 return len(self) - pos
5764
-> 5765 raise ValueError("index must be monotonic increasing or decreasing")
5766
5767 def get_slice_bound(self, label, side: str_t, kind=None) -> int:
ValueError: index must be monotonic increasing or decreasing
I tried to look for solutions for this error wherein people suggested using sort_index()/sort_values() for the 'date' column but it still does not work. I believe the issue is with the resample function.
Any help would be appreciated. Thank you