I created a script to load data, check NA values, and fill all NA values. Here is my code:
import pandas as pd
def filter_df(merged_df, var_list):
ind = merged_df.Name.isin(var_list)
return merged_df[ind]
def pivot_df(df):
return df.pivot(index='Date', columns='Name', values=['Open', 'High', 'Low', 'Close'])
def validation_df(input, summary = False):
df = input.copy()
# na check
missing = df.isna().sum().sort_values(ascending=False)
percent_missing = ((missing / df.isnull().count()) * 100).sort_values(ascending=False)
missing_df = pd.concat([missing, percent_missing], axis=1, keys=['Total', 'Percent'], sort=False)
# fill na
columns = list(missing_df[missing_df['Total'] >= 1].reset_index()['index'])
for col in columns:
null_index = df.index[df[col].isnull() == True].tolist()
null_index.sort()
for ind in null_index:
if ind > 0:
print(df.loc[ind, col])
print(df.loc[ind - 1, col])
df.loc[ind, col] = df.loc[ind - 1, col]
if ind == 0:
df.loc[ind, col] = 0
# outliers check
count = []
for col in df.columns:
count.append(sum(df[col] > df[col].mean() + 2 * df[col].std()) + sum(df[col] < df[col].mean() - 2 * df[col].std()))
outliers_df = pd.DataFrame({'Columns': df.columns, 'Count': count}).sort_values(by = 'Count')
if summary == True:
print('missing value check:/n')
print(missing_df)
print('/n outliers check:/n')
print(outliers_df)
return df
def join_df(price_df, transaction_df, var_list):
price_df = filter_df(price_df, var_list)
price_df = pivot_df(price_df)
joined_df = transaction_df.merge(price_df, how = 'left', on = 'Date')
#joined_df = validation_df(joined_df)
return joined_df
token_path = 'https://raw.githubusercontent.com/Carloszone/Cryptocurrency_Research_project/main/datasets/1_token_df.csv'
transaction_path = 'https://raw.githubusercontent.com/Carloszone/Cryptocurrency_Research_project/main/datasets/transaction_df.csv'
var_list = ['Bitcoin', 'Ethereum', 'Golem', 'Solana']
token_df = pd.read_csv(token_path)
transaction_df = pd.read_csv(transaction_path)
df = join_df(token_df, transaction_df, var_list)
df = validation_df(df)
But it did not work. I checked my code and found this issue came from the loc(). For example:
df = join_df(token_df, transaction_df, var_list)
print(df[df.columns[15]])
print(df.loc[1,df.columns[15]])
what I got is:
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
..
2250 NaN
2251 NaN
2252 NaN
2253 NaN
2254 NaN
Name: (High, Solana), Length: 2255, dtype: float64
AssertionError Traceback (most recent call last)
<ipython-input-19-75f01cc22c9c> in <module>()
2
3 print(df[df.columns[15]])
----> 4 print(df.loc[1,df.columns[15]])
2 frames
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
923 with suppress(KeyError, IndexError):
924 return self.obj._get_value(*key, takeable=self._takeable)
--> 925 return self._getitem_tuple(key)
926 else:
927 # we by definition only have the 0th axis
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
1107 return self._multi_take(tup)
1108
-> 1109 return self._getitem_tuple_same_dim(tup)
1110
1111 def _get_label(self, label, axis: int):
/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in _getitem_tuple_same_dim(self, tup)
807 # We should never have retval.ndim < self.ndim, as that should
808 # be handled by the _getitem_lowerdim call above.
--> 809 assert retval.ndim == self.ndim
810
811 return retval
AssertionError:
I don't know why df[column_name] is available, but df.loc[index,columns_name] is wrong.
You can check my code on Colab: https://colab.research.google.com/drive/1Yg280JRwFayW1tdp4OJqTO5-X3dGsItB?usp=sharing