-3

Below is the code; however, it is very slow when dealing with large data (it may take more than a day for a DataFrame with 5,000,000 rows and 6 columns).

How could I optimise it? Many thanks.

def ewm(df):
    """Return entropy-based weights for the columns of ``df``.

    Every column is min-max scaled. Each non-zero scaled cell is divided by
    its column total to give a share ``p`` and contributes
    ``-k * p * log(p)``, where ``k = 1 / log(number of rows)``; cells equal
    to zero contribute ``0.0``. A column's weight is ``1 - (sum of its
    contributions)`` divided by that same quantity summed over all columns.

    Returns:
        A one-column DataFrame named ``weight``, indexed by ``df.columns``
        and rounded to 5 decimal places.

    Note:
        Cells are visited one at a time and the column total is recomputed
        for every non-zero cell, so run time grows quickly with the number
        of rows.
    """
    def _min_max(column):
        lowest = np.min(column)
        return (column - lowest) / (np.max(column) - lowest)

    scaled = df.apply(_min_max)
    n_rows, n_cols = scaled.shape
    k = 1.0 / math.log(n_rows)

    # Build the table of per-cell entropy contributions row by row.
    terms = []
    for r in range(n_rows):
        row_terms = []
        for c in range(n_cols):
            if scaled.iloc[r][c] != 0:
                share = scaled.iloc[r][c] / scaled.iloc[:, c].sum()
                row_terms.append(math.log(share) * share * (-k))
            else:
                row_terms.append(0.0)
        terms.append(row_terms)
    terms = pd.DataFrame(terms)

    # One minus the column entropy; normalised below so the weights sum to 1.
    divergence = 1 - terms.sum(axis=0)
    weights = pd.DataFrame(
        [divergence[c] / sum(divergence) for c in range(n_cols)]
    )
    weights = weights.round(5)
    weights.index = scaled.columns
    weights.columns = ['weight']
    return weights
s666
  • 266
  • 1
  • 3
  • 14

2 Answers2

1

Use `iat` instead of `iloc` when reading a single value, and if you perform the same `iloc` lookup twice, save the result in a temporary variable.

import pandas as pd
import time
import numpy as np
import math

#original method
def ewm(df):
    """Compute entropy-based column weights, one cell at a time.

    Columns are min-max scaled first. For each scaled cell that is not
    zero, its share ``p`` of the column total contributes
    ``-k * p * log(p)`` with ``k = 1 / log(number of rows)``; zero cells
    contribute ``0.0``. Each column's ``1 - (sum of contributions)`` is then
    divided by the total of those values across columns.

    Returns:
        A DataFrame with a single ``weight`` column, indexed by
        ``df.columns``, with values rounded to 5 decimal places.
    """
    normalised = df.apply(
        lambda col: (col - np.min(col)) / (np.max(col) - np.min(col))
    )
    n_rows, n_cols = normalised.shape
    k = 1.0 / math.log(n_rows)

    def cell_term(r, c):
        # Zero cells are defined to contribute nothing (avoids log(0)).
        if normalised.iloc[r][c] == 0:
            return 0.0
        share = normalised.iloc[r][c] / normalised.iloc[:, c].sum()
        return math.log(share) * share * (-k)

    terms = pd.DataFrame(
        [[cell_term(r, c) for c in range(n_cols)] for r in range(n_rows)]
    )

    spread = 1 - terms.sum(axis=0)
    result = pd.DataFrame([spread[c] / sum(spread) for c in range(n_cols)])
    result = result.round(5)
    result.index = normalised.columns
    result.columns = ['weight']
    return result


#modified method
def ewm1(df):
    """Compute entropy-based column weights with cheaper per-cell access.

    Columns are min-max scaled. Each non-zero scaled cell's share ``p`` of
    its column total contributes ``-k * p * log(p)``, with
    ``k = 1 / log(number of rows)``; zero cells contribute ``0.0``. A
    column's weight is its ``1 - (sum of contributions)`` divided by the sum
    of that quantity over all columns.

    Args:
        df: DataFrame of numeric columns.

    Returns:
        A DataFrame with a single ``weight`` column, indexed by
        ``df.columns``, with values rounded to 5 decimal places.

    Raises:
        ZeroDivisionError: if ``df`` has exactly one row (``log(1) == 0``).
        ValueError: if ``df`` has no rows (``math.log(0)``).
    """
    df = df.apply(lambda x: ((x - np.min(x)) / (np.max(x) - np.min(x))))
    rows, cols = df.shape
    k = 1.0 / math.log(rows)

    # The column totals never change inside the loops, so compute each one
    # once instead of once per cell.
    col_sums = [df.iloc[:, j].sum() for j in range(cols)]

    # Start from 0.0 everywhere; only non-zero cells are overwritten.
    lnf = [[0.0] * cols for _ in range(rows)]
    for i in range(rows):
        for j in range(cols):
            # iat is the scalar accessor: read the cell once and reuse it.
            tmp = df.iat[i, j]
            if tmp != 0:
                p = tmp / col_sums[j]
                lnf[i][j] = math.log(p) * p * (-k)
    lnf = pd.DataFrame(lnf)

    d = 1 - lnf.sum(axis=0)
    w = pd.DataFrame(
        (d / d.sum()).to_numpy(), index=df.columns, columns=['weight']
    )
    return w.round(5)    #.applymap(lambda x:format(x,'.5f'))



# Time both implementations on the same random 1000 x 6 frame.
df = pd.DataFrame(np.random.rand(1000, 6))

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is a wall clock that can be adjusted.
start = time.perf_counter()
ewm(df)
print(time.perf_counter() - start)

start1 = time.perf_counter()
ewm1(df)
print(time.perf_counter() - start1)

The time for the first function is 1.9747240543365479 seconds.

For the second it is 0.820796012878418 seconds.

I'm not sure what the method does, but if you can break it into a few functions that return numeric values, you can cache (hash) their results and improve it much more.

trigonom
  • 528
  • 4
  • 9
1

Having NumPy do the loops should speed it up a lot.

import math

import numpy as np
import pandas as pd

def ewm(df):
    """Compute entropy-based column weights with vectorised operations.

    Columns are min-max scaled and divided by their column totals to give
    shares ``p``. Each column's entropy is ``-k * sum(p * log(p))`` with
    ``k = 1 / log(number of rows)``, where zero shares contribute nothing.
    A column's weight is ``1 - entropy`` divided by the sum of
    ``1 - entropy`` over all columns.

    Args:
        df: DataFrame of numeric columns.

    Returns:
        A DataFrame with a single ``weight`` column, indexed by
        ``df.columns``, with values rounded to 5 decimal places.

    Raises:
        ZeroDivisionError: if ``df`` has exactly one row (``log(1) == 0``).
        ValueError: if ``df`` has no rows (``math.log(0)``).
    """
    rows = df.shape[0]
    k = 1.0 / math.log(rows)

    col_min = df.min(axis=0)
    scaled = (df - col_min) / (df.max(axis=0) - col_min)
    p = scaled / scaled.sum(axis=0)

    # Replace non-positive shares with 1 before taking the log: log(1) == 0,
    # so those cells contribute exactly 0 and log(0) is never evaluated.
    # (np.log(..., where=mask) without out= would leave the masked-out cells
    # uninitialised.)
    safe_p = p.where(p > 0, 1.0)
    entropy = -k * (p * np.log(safe_p)).sum(axis=0)

    d = 1 - entropy
    w = (d / d.sum()).round(5).to_frame(name='weight')
    w.index = df.columns
    return w
jack
  • 61
  • 1