import os
import pandas as pd
def concat_df(file_list):
df = pd.DataFrame()
for f in file_list:
tmp_df = pd.read_csv(f)
df = pd.concat((df, tmp_df), axis=0)
return df
root_dir = 'ce survey'
if os.path.exists('total.csv'):
os.remove('total.csv')
years = list(range(2012, 2022))
for year in years:
print(year)
dir_path = os.path.join(root_dir, str(year))
files = os.listdir(dir_path)
expd_files = []
fmld_files = []
fmli_files = []
memd_files = []
for file in files:
if file.endswith('.csv'):
fp = os.path.join(dir_path, file)
if 'expd' in file:
expd_files.append(fp)
elif 'fmld' in file:
fmld_files.append(fp)
elif 'fmli' in file:
fmli_files.append(fp)
elif 'memd' in file:
fmli_files.append(fp)
expd_df = concat_df(expd_files)
fmld_df = concat_df(fmld_files)
fmli_df = concat_df(fmli_files)
memd_df = concat_df(memd_files)
expd_df = expd_df.loc[expd_df['UCC'] == 180620 ][['NEWID', 'COST', [['UCC']=290420,320130,410901], 'EXPNYR']]
if year == 2015 or year == 2016 or year == 2017:
fmld_df = fmld_df[['NEWID', 'REGION', 'FWAGEX', 'INC_RANK', 'HISP_REF', 'HORREF1', 'HORREF2', 'RACE2', 'REF_RACE', 'EDUC_REF', 'FAM_SIZE', 'INCLASS','FAM_TYPE','CHILDAGE',]]
elif year == 2018 or year == 2019 or year == 2020 or year == 2021:
fmld_df = fmld_df[
['NEWID', 'REGION', 'FWAGEX', 'INC_RANK', 'HISP_REF', 'HORREF1', 'HORREF2', 'RACE2', 'REF_RACE', 'EDUC_REF', 'FAM_SIZE','FAM_TYPE','CHILDAGE',]]
else:
fmld_df = fmld_df[['NEWID', 'REGION', 'FWAGEX', 'INC_RANK', 'HISP_REF', 'HORREF1', 'HORREF2', 'RACE2', 'REF_RACE', 'EDUC_REF', 'FAM_SIZE', 'FINCAFTX', 'INCLASS']]
if year == 2012:
fmli_df = fmli_df[['NEWID', 'INCLASS2', 'RACE2']]
else:
fmli_df = fmli_df[['NEWID', 'INCLASS2', 'RACE2', 'FINATXEM']]
memd_df = memd_df[['NEWID','CU_CODE1','EMPLTYPE', 'SEX', 'OCCUEARN', 'OCCULIST', 'AGE', 'WKS_WRKD', 'MARITAL']]
expd_df = pd.merge(left=expd_df, right=fmld_df, on='NEWID', how='left')
expd_df = pd.merge(left=expd_df, right=fmli_df, on='NEWID', how='left')
expd_df = pd.merge(left=expd_df, right=memd_df, on='NEWID', how='left')
if year == 2012:
expd_df['FINATXEM'] = ''
if year == 2015 or year == 2016 or year == 2017:
expd_df['FINCAFTX'] = ''
elif year == 2018 or year == 2019 or year == 2020 or year == 2021:
expd_df['INCLASS'] = ''
expd_df['FINCAFTX'] = ''
expd_df['YEAR'] = year
expd_df = expd_df[['NEWID', 'COST', 'UCC', 'EXPNYR', 'REGION', 'FWAGEX', 'INC_RANK',
'HISP_REF', 'HORREF1', 'HORREF2', 'RACE2_x', 'REF_RACE', 'EDUC_REF',
'FAM_SIZE', 'FINCAFTX', 'INCLASS', 'INCLASS2', 'RACE2_y', 'FINATXEM']]
if year == 2012:
expd_df.to_csv('total.csv', mode='a', index=False)
else:
expd_df.to_csv('total.csv', mode='a', index=False, header=None)
I'm trying to get information to see how other factors could influence the cost of 180620. 290420,320130,410901 are in the same column with 180620. I tried to use 180620 as the condition to filter other information and 290420,320130,410901 should be contained as other information. In the sheet of expd, UCC is the expenses of NEWIDS, different UCC code could be paired with same newid. I and trying to find NEWIDS paired with 180620 first and get the cost of 180620, then see if the NEWIDS' cost on 290420,320130,410901. In the end, merge all the information from different files into one CSV sheet.
Thank you for your time to look at this. This problem bothers me for really a long time.