0

I have daily data of 40 years in a pandas dataframe with columns[Index = Date, Data] and I would like to extract the data from each month with cumalative sum of the data contained in days in the order showed in the code below, that means I have to repeat that code 12 times (thats for each month).

I would like to know if there is a more efficient way to code with out having to repeat many times the same.

def datos_por_dias(precipitacion):

    datos_final = precipitacion


    datos_enero1 = np.array([])
    datos_enero2 = np.array([])
    datos_enero3 = np.array([])
    datos_enero4 = np.array([])


    comienzo = time.time()

    for i in groupsY.year:
        datos_enero1 = np.append([[datos_enero1]], [[np.sum(datos_final[(datos_final.index.day <= 15) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero1 = np.append([[datos_enero1]], [[np.sum(datos_final[(datos_final.index.day > 15) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])


    final = time.time()
    print(final - comienzo)

    comienzo = time.time()
    for i in groupsY.year:
        datos_enero2 = np.append([[datos_enero2]], [[np.sum(datos_final[(datos_final.index.day <= 8) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero2 = np.append([[datos_enero2]], [[np.sum(datos_final[(datos_final.index.day > 8) & (datos_final.index.day <= 15) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero2 = np.append([[datos_enero2]], [[np.sum(datos_final[(datos_final.index.day > 15) & (datos_final.index.day <= 23) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero2 = np.append([[datos_enero2]], [[np.sum(datos_final[(datos_final.index.day > 23) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])

    final = time.time()
    print(final - comienzo)


    comienzo = time.time()
    for i in groupsY.year:
        datos_enero3 = np.append([[datos_enero3]], [[np.sum(datos_final[(datos_final.index.day <= 4) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero3 = np.append([[datos_enero3]], [[np.sum(datos_final[(datos_final.index.day > 4) & (datos_final.index.day <= 8) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero3 = np.append([[datos_enero3]], [[np.sum(datos_final[(datos_final.index.day > 8) & (datos_final.index.day <= 12) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero3 = np.append([[datos_enero3]], [[np.sum(datos_final[(datos_final.index.day > 12) & (datos_final.index.day <= 16) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero3 = np.append([[datos_enero3]], [[np.sum(datos_final[(datos_final.index.day > 16) & (datos_final.index.day <= 20) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero3 = np.append([[datos_enero3]], [[np.sum(datos_final[(datos_final.index.day > 20) & (datos_final.index.day <= 24) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero3 = np.append([[datos_enero3]], [[np.sum(datos_final[(datos_final.index.day > 24) & (datos_final.index.day <= 28) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero3 = np.append([[datos_enero3]], [[np.sum(datos_final[(datos_final.index.day > 28) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])

    final = time.time()
    print(final - comienzo)


    comienzo = time.time()
    for i in groupsY.year:
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day <= 2) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 2) & (datos_final.index.day <= 4) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 4) & (datos_final.index.day <= 6) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 6) & (datos_final.index.day <= 8) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 8) & (datos_final.index.day <= 10) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 10) & (datos_final.index.day <= 12) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 12) & (datos_final.index.day <= 14) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 14) & (datos_final.index.day <= 16) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 18) & (datos_final.index.day <= 20) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 22) & (datos_final.index.day <= 24) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 26) & (datos_final.index.day <= 28) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])
        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 28) & (datos_final.index.day <= 30) & (datos_final.index.month==1) & (datos_final.index.year==i)])]])

        datos_enero4 = np.append([[datos_enero4]], [[np.sum(datos_final[(datos_final.index.day > 30)  & (datos_final.index.month==1) & (datos_final.index.year==i)])]])

    final = time.time()
    print(final - comienzo)

    pE = datos_final.loc[(datos_final.index.month==1)]


    return (datos_enero1, datos_enero2, datos_enero3, datos_enero4, 
            pE)

A fragment of the data is shown below

[Clip of daily data][1]

This is my data

The year data corresponds to the data in the for loop i.e. groupsY DataFrame for i in groupsY.year

Clip of groupsY

And i would like to get the result shown below (this is for the first for loop i.e. splits the month in 2 parts and makes a cumulative sum of the data in each part)

Result wanted

Thanks for your help.

Andy M
  • 15
  • 5
  • PResumably your data is more than the year column shown? What does your data look like now, and what would you like it to look like as the outcome? – ako May 23 '20 at 04:53
  • Thanks for your reply, the year data corresponds to the data in the for loop i.e. groupsY DataFrame ```for i in groupsY.year:``` My data looks like shown below Date `Date Data 1/1/1962 54 1/2/1962 20 1/3/1962 0 1/4/1962 0 1/5/1962 0 1/6/1962 0 1/7/1962 2 1/8/1962 0 1/9/1962 0 1/10/1962 0 1/11/1962 0 1/12/1962 5 1/13/1962 0 1/14/1962 0 1/15/1962 49 1/16/1962 0 1/17/1962 0 1/18/1962 0 1/19/1962 0 1/20/1962 0 1/21/1962 0 1/22/1962 8 1/23/1962 0 1/24/1962 0 1/25/1962 0 1/26/1962 0 1/27/1962 0 1/28/1962 0 1/29/1962 0 1/30/1962 0 1/31/1962 0 2/1/1962 9 2/2/1962 1` – Andy M May 23 '20 at 13:14

1 Answers1

0

Here is a way that could get you started - your function appears to return a tuple of data, but what you pasted looks different, so you will still need to work on the exact format you need.

# create mock data

index=pd.DatetimeIndex(start='1/1/1962',end='1/1/1966',freq='D')

data=pd.DataFrame(data={'Datos':np.random.choice(range(11),size=index.shape[0]),'Date':index})
Date    Datos

data.head()
0   1962-01-01  7
1   1962-01-02  3
2   1962-01-03  0
3   1962-01-04  7
4   1962-01-05  9

Then I extract key date components (year, month, day) - I noticed you split your months in chuncks before and after the 16th, including February, a shorter month, so I just split the month on the 16th day.

data['day']=data.Date.dt.day
data['year']=data.Date.dt.year
data['month']=data.Date.dt.month
data['DateGroup']=(data.day<17).map({True:'First Half',False:'Second Half'})

Then, your cumulative sum looks to just be the sum for each half month group, so we sum by year, month and DateGroup and get:

data.groupby(['year','month','DateGroup']).Datos.sum().head(6)

year  month  DateGroup  
1962  1      First Half      83
             Second Half     73
      2      First Half     100
             Second Half     37
      3      First Half      81
             Second Half     71
ako
  • 3,569
  • 4
  • 27
  • 38
  • Did this help answer your question, @AndyM ? – ako May 25 '20 at 04:27
  • It did help but in `data['DateGroup']=(data.day<17).map({True:'First Half',False:'Second Half'})` how can i split in more than 2 halfs. Can I put more split parameters like in this `(datos_final.index.day > 4) & (datos_final.index.day <= 8)` case? Thanks for your help. – Andy M May 26 '20 at 22:34
  • you can use `pd.cut` with dates directly, but you can also say `pd.cut(data.day,bins=[0,5,15,25,32])` to just bin on the day, for example. – ako May 28 '20 at 18:41