-1

I have two datasets df_population_by_age (has estimated population proportion by sex and age) and df_population_bracket (has actual population per age group). The idea is to use the estimated proportions from df_population_age, to calculate the individual ages for 12, 13 and 14 year olds from the 10-14 age group/bracket. The two dataframes can be created using the code below:

import pandas as pd

data_age = {'Year':[2019,2019,2019,2019,2019,2020,2020,2020,2020,2020,2021,2021,2021,2021,2021,2022,2022,2022,2022,2022],
        'GROUP':[10,11,12,13,14,10,11,12,13,14,10,11,12,13,14,10,11,12,13,14],
        'Male Pop %':[20.81734529,20.40093945,19.92488209,19.53475796,19.3220752,20.78257696,20.4430872,20.03334197,19.5642673,19.17672657,20.60812359,20.41674418,20.08270641,19.678726,19.21369982,20.17031461,20.36601678,20.17660054,19.84519833,19.44186975,],
        'Female Pop %':[20.86138492,20.4243707,19.94312394,19.53373523,19.23738521,20.82046158,20.45422644,20.02531257,19.55234472,19.1476547,20.54653854,20.46182048,20.10165403,19.67898124,19.2110057,20.11753111,20.31743763,20.23351925,19.87628941,19.4552226,]}

df_population_by_age = pd.DataFrame(data_age)

data_age_bracket = {'district':['EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
'EC - Alfred Nzo District Municipality (DC44)','EC - Alfred Nzo District Municipality (DC44)',
],
'Sex':['Female','Female','Female','Female','Male','Male','Male','Male','Female','Female',
'Female','Female','Male','Male','Male','Male','Female','Female','Female','Female','Male',
'Male','Male','Male','Female','Female','Female','Female','Male','Male','Male','Male',
],
'Age':['0-4','10-14','15-17','5-9','0-4','10-14','15-17','5-9','0-4','10-14','15-17','5-9','0-4','10-14','15-17',
'5-9','0-4','10-14','15-17','5-9','0-4','10-14','15-17','5-9','0-4','10-14','15-17','5-9','0-4','10-14','15-17','5-9',
],
'month of estimation':['2019/07/31','2019/07/31','2019/07/31','2019/07/31','2019/07/31','2019/07/31','2019/07/31','2019/07/31',
'2020/07/31','2020/07/31','2020/07/31','2020/07/31','2020/07/31','2020/07/31','2020/07/31','2020/07/31',
'2021/07/31','2021/07/31','2021/07/31','2021/07/31','2021/07/31','2021/07/31','2021/07/31','2021/07/31',
'2022/07/31','2022/07/31','2022/07/31','2022/07/31','2022/07/31','2022/07/31','2022/07/31','2022/07/31'
],
'population':[56012.07456,54977.51262,26545.14284,56929.65597,56429.27585,53654.3515,25290.17196,
56809.85359,55146.23784,55907.47753,27366.36564,57057.8536,55620.29277,54631.38735,25924.10209,
56819.59782,54417.37763,56739.82262,28461.41523,56754.49009,55016.92924,55483.59369,27088.30327,
56421.42684,53695.71713,56414.07898,28784.89557,56469.46606,54304.69991,55249.91613,26799.95513,56052.97587,
],
'Year':[2019,2019,2019,2019,2019,2019,2019,2019,2020,2020,2020,2020,2020,2020,2020,2020,
2021,2021,2021,2021,2021,2021,2021,2021,2022,2022,2022,2022,2022,2022,2022,2022,
]}
df_population_bracket = pd.DataFrame(data_age_bracket)

A previous data scientist created 3 hardcoded functions (for ages 12, 13, 14) to multiply the proportion within the 10-14 age group and map the calculated population to their own columns. The functions and the calculations are shown below:

def function_check_age_12(df):
    """
    This function is checking the population of the age group 10-14 for the years 
    2019, 2020, 2021, and 2022 then mapping the proportion for 12 year olds for each Sex
    """
    if df['Sex'] == 'Male' and df['Year'] == 2019 and df['Age'] == '10-14':
        return df['population']*(19.924882/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2019 and df['Age'] == '10-14':
        return df['population']*(19.943124/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2020 and df['Age'] == '10-14':
        return df['population']*(20.033342/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2020 and df['Age'] == '10-14':
        return df['population']*(20.025313/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2021 and df['Age'] == '10-14':
        return df['population']*(20.082706/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2021 and df['Age'] == '10-14':
        return df['population']*(20.101654/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2022 and df['Age'] == '10-14':
        return df['population']*(20.176601/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2022 and df['Age'] == '10-14':
        return df['population']*(20.233519/100)
    else: 
        return df['population']
        
def function_check_age_13(df):
    """
    This function is checking the population of the age group 10-14 for the years 
    2019, 2020, 2021, and 2022 then mapping the proportion for 13 year olds for each Sex
    """
    if df['Sex'] == 'Male' and df['Year'] == 2019 and df['Age'] == '10-14':
        return df['population']*(19.534758/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2019 and df['Age'] == '10-14':
        return df['population']*(19.533735/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2020 and df['Age'] == '10-14':
        return df['population']*(19.564267/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2020 and df['Age'] == '10-14':
        return df['population']*(19.552345/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2021 and df['Age'] == '10-14':
        return df['population']*(19.678726/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2021 and df['Age'] == '10-14':
        return df['population']*(19.678981/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2022 and df['Age'] == '10-14':
        return df['population']*(19.845198/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2022 and df['Age'] == '10-14':
        return df['population']*(19.876289/100)
    else: 
        return df['population']

def function_check_age_14(df):
    """
    This function is checking the population of the age group 10-14 for the years 
    2019, 2020, 2021, and 2022 then mapping the proportion for 14 year olds for each Sex
    """
    if df['Sex'] == 'Male' and df['Year'] == 2019 and df['Age'] == '10-14':
        return df['population']*(19.322075/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2019 and df['Age'] == '10-14':
        return df['population']*(19.237385/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2020 and df['Age'] == '10-14':
        return df['population']*(19.176727/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2020 and df['Age'] == '10-14':
        return df['population']*(19.147655/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2021 and df['Age'] == '10-14':
        return df['population']*(19.213700/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2021 and df['Age'] == '10-14':
        return df['population']*(19.211006/100)
    elif df['Sex'] == 'Male' and df['Year'] == 2022 and df['Age'] == '10-14':
        return df['population']*(19.441870/100)
    elif df['Sex'] == 'Female' and df['Year'] == 2022 and df['Age'] == '10-14':
        return df['population']*(19.455223/100)
    else: 
        return df['population']
        
df_population_bracket['population_12'] = df_population_bracket.apply(lambda row: function_check_age_12(row), axis=1)
df_population_bracket['population_13'] = df_population_bracket.apply(lambda row: function_check_age_13(row), axis=1)
df_population_bracket['population_14'] = df_population_bracket.apply(lambda row: function_check_age_14(row), axis=1)

The functions work but do not actually iterate through the values within the df_population_by_age dataframe. How can I refactor the code so it performs the same functionality without the hardcoding.

Mazil_tov998
  • 396
  • 1
  • 13

1 Answers1

0

You can simplify your functions with equations for the points (based on the equations given from https://www.had2know.org/academics/cubic-through-4-points.html using the year and hardcoded values)

import math

def function_check_age_12(df):
    """
    This function is checking the population of the age group 10-14 for the years 
    2019, 2020, 2021, and 2022 then mapping the proportion for 12 year olds for each Sex
    """
    if df['Age'] != '10-14' or df['Year'] not in [2019, 2020, 2021, 2022]: # guard clause to limit number of comparisons done in if statements
        return df['population']

    if df['Sex'] == 'Male':
        return df['population']* (((-0.00001*math.pow(df['Year'], 3)) + (0.07017*math.pow(df['Year'], 2)) + (-141.76891*df['Year']) + 95493.68332) / 100)
    elif df['Sex'] == 'Female':
        return df['population']* (((-0.00001*math.pow(df['Year'], 3)) + (0.04155*math.pow(df['Year'], 2)) + (-83.91622*df['Year']) + 56515.28286) / 100)
    else: 
        return df['population']
        
def function_check_age_13(df):
    """
    This function is checking the population of the age group 10-14 for the years 
    2019, 2020, 2021, and 2022 then mapping the proportion for 13 year olds for each Sex
    """
    if df['Age'] != '10-14' or df['Year'] not in [2019, 2020, 2021, 2022]: # guard clause to limit number of comparisons done in if statements
        return df['population']

    if df['Sex'] == 'Male':
        return df['population']* (((0*math.pow(df['Year'], 3)) + (-0.02232*math.pow(df['Year'], 2)) + (45.16703*df['Year']) + -30442.32296) / 100)
    elif df['Sex'] == 'Female':
        return df['population']* (((0*math.pow(df['Year'], 3)) + (-0.02532*math.pow(df['Year'], 2)) + (51.23819*df['Year']) + -34540.75603) / 100)
    else: 
        return df['population']

def function_check_age_14(df):
    """
    This function is checking the population of the age group 10-14 for the years 
    2019, 2020, 2021, and 2022 then mapping the proportion for 14 year olds for each Sex
    """
    if df['Age'] != '10-14' or df['Year'] not in [2019, 2020, 2021, 2022]: # guard clause to limit number of comparisons done in if statements
        return df['population']

    if df['Sex'] == 'Male':
        return df['population']* (((0*math.pow(df['Year'], 3)) + (0.00595*math.pow(df['Year'], 2)) + (-11.87752*df['Year']) + 7923.10051) / 100)
    elif df['Sex'] == 'Female':
        return df['population']* (((0*math.pow(df['Year'], 3)) + (0.01876*math.pow(df['Year'], 2)) + (-37.77381*df['Year']) + 25373.24964) / 100)
    else: 
        return df['population']
        
df_population_bracket['population_12'] = df_population_bracket.apply(lambda row: function_check_age_12(row), axis=1)
df_population_bracket['population_13'] = df_population_bracket.apply(lambda row: function_check_age_13(row), axis=1)
df_population_bracket['population_14'] = df_population_bracket.apply(lambda row: function_check_age_14(row), axis=1)

Though this approach still leaves some hardcoded values, but this gives the least amount of hardcoded values available, considering that each equation is different.

  • assumed that the values were on a singular plane to make an equation.
Andrew Ryan
  • 1,489
  • 3
  • 15
  • 21