0

I am stuck at this point in my code. I am trying to divide the startdate and enddate into multiple rows based on months and for that I am trying to use the resample function to sample the dates on monthly basis. The sample code looks like this-

PS- A lot of the BCA_REF, STARTDATE, ENDDATE values are repeated and are not unique owing to the usecase

df = pd.DataFrame(
    data = [['abc','2018-08-01','2025-07-31'], ['abc','2018-08-01','2025-07-31'],['xyz','2017-04-01','2017-04-01'], ['xyz','2017-04-01','2017-04-01'], ['pqr','2016-05-16','2017-10-15']], 
    columns = ['BCA_REF', 'STARTDATE', 'ENDDATE'] 
)
df['STARTDATE'] = pd.to_datetime(df['STARTDATE'])
df['ENDDATE'] = pd.to_datetime(df['ENDDATE'])


df_start_end = df.melt(id_vars=['BCA_REF'],value_vars=['STARTDATE','ENDDATE'], value_name='date')


df_new = (
    df_start_end.groupby(['BCA_REF'])
    .apply(lambda x: x.drop_duplicates('date').set_index('date')
    .resample('M').pad())
    .drop(columns=['BCA_REF','variable'])
    .reset_index()
)

After I run this for 40K such rows, it gives me the following error

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_15069/2048245652.py in <module>
      4 merged_final_new = (
      5     mf_start_end.groupby(['BCA_REF'])
----> 6     .apply(lambda x: x.drop_duplicates('date').set_index('date')
      7     .resample('M').pad())
      8 #     .drop(columns=['BCA_REF','variable'])

~/.local/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
   1273         with option_context("mode.chained_assignment", None):
   1274             try:
-> 1275                 result = self._python_apply_general(f, self._selected_obj)
   1276             except TypeError:
   1277                 # gh-20949

~/.local/lib/python3.7/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f, data)
   1307             data after applying f
   1308         """
-> 1309         keys, values, mutated = self.grouper.apply(f, data, self.axis)
   1310 
   1311         return self._wrap_applied_output(

~/.local/lib/python3.7/site-packages/pandas/core/groupby/ops.py in apply(self, f, data, axis)
    850             # group might be modified
    851             group_axes = group.axes
--> 852             res = f(group)
    853             if not _is_indexed_like(res, group_axes, axis):
    854                 mutated = True

/tmp/ipykernel_15069/2048245652.py in <lambda>(x)
      5     mf_start_end.groupby(['BCA_REF'])
      6     .apply(lambda x: x.drop_duplicates('date').set_index('date')
----> 7     .resample('M').pad())
      8 #     .drop(columns=['BCA_REF','variable'])
      9 #     .reset_index()

~/.local/lib/python3.7/site-packages/pandas/core/resample.py in pad(self, limit)
    507         DataFrame.fillna: Fill NA/NaN values using the specified method.
    508         """
--> 509         return self._upsample("pad", limit=limit)
    510 
    511     ffill = pad

~/.local/lib/python3.7/site-packages/pandas/core/resample.py in _upsample(self, method, limit, fill_value)
   1204         else:
   1205             result = obj.reindex(
-> 1206                 res_index, method=method, limit=limit, fill_value=fill_value
   1207             )
   1208 

~/.local/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    322         @wraps(func)
    323         def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 324             return func(*args, **kwargs)
    325 
    326         kind = inspect.Parameter.POSITIONAL_OR_KEYWORD

~/.local/lib/python3.7/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
   4770         kwargs.pop("axis", None)
   4771         kwargs.pop("labels", None)
-> 4772         return super().reindex(**kwargs)
   4773 
   4774     @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"])

~/.local/lib/python3.7/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
   4817         # perform the reindex on the axes
   4818         return self._reindex_axes(
-> 4819             axes, level, limit, tolerance, method, fill_value, copy
   4820         ).__finalize__(self, method="reindex")
   4821 

~/.local/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   4596         if index is not None:
   4597             frame = frame._reindex_index(
-> 4598                 index, method, copy, level, fill_value, limit, tolerance
   4599             )
   4600 

~/.local/lib/python3.7/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   4612     ):
   4613         new_index, indexer = self.index.reindex(
-> 4614             new_index, method=method, level=level, limit=limit, tolerance=tolerance
   4615         )
   4616         return self._reindex_with_indexers(

~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in reindex(self, target, method, level, limit, tolerance)
   3824                 if self._index_as_unique:
   3825                     indexer = self.get_indexer(
-> 3826                         target, method=method, limit=limit, tolerance=tolerance
   3827                     )
   3828                 else:

~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in get_indexer(self, target, method, limit, tolerance)
   3484             )
   3485 
-> 3486         return self._get_indexer(target, method, limit, tolerance)
   3487 
   3488     def _get_indexer(

~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _get_indexer(self, target, method, limit, tolerance)
   3506 
   3507         if method in ["pad", "backfill"]:
-> 3508             indexer = self._get_fill_indexer(target, method, limit, tolerance)
   3509         elif method == "nearest":
   3510             indexer = self._get_nearest_indexer(target, limit, tolerance)

~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _get_fill_indexer(self, target, method, limit, tolerance)
   3582             indexer = engine_method(target_values, limit)
   3583         else:
-> 3584             indexer = self._get_fill_indexer_searchsorted(target, method, limit)
   3585         if tolerance is not None and len(self):
   3586             indexer = self._filter_indexer_tolerance(target_values, indexer, tolerance)

~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _get_fill_indexer_searchsorted(self, target, method, limit)
   3606         indexer = self.get_indexer(target)
   3607         nonexact = indexer == -1
-> 3608         indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side)
   3609         if side == "left":
   3610             # searchsorted returns "indices into a sorted array such that,

~/.local/lib/python3.7/site-packages/pandas/core/indexes/base.py in _searchsorted_monotonic(self, label, side)
   5763             return len(self) - pos
   5764 
-> 5765         raise ValueError("index must be monotonic increasing or decreasing")
   5766 
   5767     def get_slice_bound(self, label, side: str_t, kind=None) -> int:

ValueError: index must be monotonic increasing or decreasing

I tried to look for solutions for this error wherein people suggested using sort_index()/sort_values() for the 'date' column but it still does not work. I believe the issue is with the resample function.

Any help would be appreciated. Thank you

0 Answers0