I have a DataFrame 'df' that has missing values in some of its columns. I want to fill in the missing/NaN values in the Avg Monthly Long Distance Charges column by predicting them (regression) from the other column values, and then replace the NaN values with the predicted values.
I received the following error message when executing my code. Is there something that I am doing wrong?
Data frame: 'df'
Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Offer,Phone Service,Avg Monthly Long Distance Charges,Multiple Lines,Internet Service,Internet Type,Avg Monthly GB Download,Online Security,Online Backup,Device Protection Plan,Premium Tech Support,Streaming TV,Streaming Movies,Streaming Music,Unlimited Data,Contract,Paperless Billing,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,9,None,Yes,42.39,No,Yes,Cable,16,No,Yes,No,Yes,Yes,No,No,Yes,One Year,Yes,Credit Card,65.6,593.3,0,0,381.51,974.81,Stayed,,
0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,9,None,Yes,10.69,Yes,Yes,Cable,10,No,No,No,No,No,Yes,Yes,No,Month-to-Month,No,Credit Card,-4,542.4,38.33,10,96.21,610.28,Stayed,,
0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,4,Offer E,Yes,33.65,No,Yes,Fiber Optic,30,No,No,Yes,No,No,No,No,Yes,Month-to-Month,Yes,Bank Withdrawal,73.9,280.85,0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,13,Offer D,Yes,27.82,No,Yes,Fiber Optic,4,No,Yes,Yes,No,Yes,Yes,No,Yes,Month-to-Month,Yes,Bank Withdrawal,98,1237.85,0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,3,None,Yes,7.38,No,Yes,Fiber Optic,11,No,No,No,Yes,Yes,No,No,Yes,Month-to-Month,Yes,Credit Card,83.9,267.4,0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability
0013-MHZWF,Female,23,No,3,Midpines,95345,37.581496,-119.972762,0,9,Offer E,Yes,16.77,No,Yes,Cable,73,No,No,No,Yes,Yes,Yes,Yes,Yes,Month-to-Month,Yes,Credit Card,69.4,571.45,0,0,150.93,722.38,Stayed,,
0013-SMEOE,Female,67,Yes,0,Lompoc,93437,34.757477,-120.550507,1,71,Offer A,Yes,9.96,No,Yes,Fiber Optic,14,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two Year,Yes,Bank Withdrawal,109.7,7904.25,0,0,707.16,8611.41,Stayed,,
0014-BMAQU,Male,52,Yes,0,Napa,94558,38.489789,-122.27011,8,63,Offer B,Yes,12.96,Yes,Yes,Fiber Optic,7,Yes,No,No,Yes,No,No,No,No,Two Year,Yes,Credit Card,84.65,5377.8,0,20,816.48,6214.28,Stayed,,
0015-UOCOJ,Female,68,No,0,Simi Valley,93063,34.296813,-118.685703,0,7,Offer E,Yes,10.53,No,Yes,DSL,21,Yes,No,No,No,No,No,No,Yes,Two Year,Yes,Bank Withdrawal,48.2,340.35,0,0,73.71,414.06,Stayed,,
0016-QLJIS,Female,43,Yes,1,Sheridan,95681,38.984756,-121.345074,3,65,None,Yes,28.46,Yes,Yes,Cable,14,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two Year,Yes,Credit Card,90.45,5957.9,0,0,1849.9,7807.8,Stayed,,
0017-DINOC,Male,47,No,0,Rancho Santa Fe,92091,32.99356,-117.207121,0,54,None,No,,,Yes,Cable,10,Yes,No,No,Yes,Yes,No,No,Yes,Two Year,No,Credit Card,45.2,2460.55,0,0,0,2460.55,Stayed,,
0017-IUDMW,Female,25,Yes,2,Sunnyvale,94086,37.378541,-122.020456,2,72,None,Yes,16.01,Yes,Yes,Fiber Optic,59,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two Year,Yes,Credit Card,116.8,8456.75,0,0,1152.72,9609.47,Stayed,,
0018-NYROU,Female,58,Yes,0,Antelope,95843,38.715498,-121.363411,0,5,None,Yes,18.65,No,Yes,Fiber Optic,10,No,No,No,No,No,No,No,Yes,Month-to-Month,Yes,Bank Withdrawal,68.95,351.5,0,0,93.25,444.75,Stayed,,
0019-EFAEP,Female,32,No,0,La Mesa,91942,32.782501,-117.01611,0,72,Offer A,Yes,2.25,Yes,Yes,Fiber Optic,16,Yes,Yes,Yes,No,Yes,No,No,Yes,Two Year,Yes,Bank Withdrawal,101.3,7261.25,0,0,162,7423.25,Stayed,,
0019-GFNTW,Female,39,No,0,Los Olivos,93441,34.70434,-120.02609,0,56,None,No,,,Yes,DSL,19,Yes,Yes,Yes,Yes,No,No,No,Yes,Two Year,No,Bank Withdrawal,45.05,2560.1,0,0,0,2560.1,Stayed,,
0020-INWCK,Female,58,Yes,2,Woodlake,93286,36.464635,-119.094348,9,71,Offer A,Yes,27.26,Yes,Yes,Fiber Optic,12,No,Yes,Yes,No,No,Yes,Yes,Yes,Two Year,Yes,Credit Card,95.75,6849.4,0,0,1935.46,8784.86,Stayed,,
0020-JDNXP,Female,52,Yes,1,Point Reyes Station,94956,38.060264,-122.830646,0,34,None,No,,,Yes,DSL,20,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,One Year,No,Credit Card,61.25,1993.2,0,0,0,1993.2,Stayed,,
0021-IKXGC,Female,72,No,0,San Marcos,92078,33.119028,-117.166036,0,1,Offer E,Yes,7.77,Yes,Yes,Fiber Optic,22,No,No,No,No,No,No,No,Yes,One Year,Yes,Bank Withdrawal,72.1,72.1,0,0,7.77,79.87,Joined,,
0022-TCJCI,Male,79,No,0,Daly City,94015,37.680844,-122.48131,0,45,None,Yes,10.67,No,Yes,DSL,17,Yes,No,Yes,No,No,Yes,No,Yes,One Year,No,Credit Card,62.7,2791.5,0,0,480.15,3271.65,Churned,Dissatisfaction,Limited range of services
My code:
# Snippet purpose: fit an OLS regression on two columns of df and use it to
# fill missing values. `df`, `pd` and `sm` are defined outside this snippet
# (presumably pandas and statsmodels.api -- confirm).
# Let X = predictor variable and y = target variable
# X2 holds the two predictor columns selected from df.
X2 = pd.DataFrame(df[['Monthly Charge', 'Total Revenue']])
# NOTE(review): the target selected here is 'Multiple Lines', which in the
# sample rows holds 'Yes'/'No' strings and blanks rather than numbers, while
# the stated goal is to impute 'Avg Monthly Long Distance Charges'. A string
# column is cast to object dtype, which matches the reported ValueError.
y2 = pd.DataFrame(df[['Multiple Lines']])
# Add a constant variable to the predictor variables
# NOTE(review): the result is stored in X, but both OLS calls below pass X2,
# so the constant column never reaches the model.
X = sm.add_constant(X2)
# The traceback points at this line: the exception is raised while the OLS
# model is being constructed, before .fit() runs.
model02 = sm.OLS(y2, X2).fit()
# NOTE(review): this builds and fits the same model a second time and passes
# the return value of .fit() itself to fillna instead of predicted values;
# model02 is not used in this snippet.
df['Multiple Lines'].fillna(sm.OLS(y2, X2).fit(), inplace=True)
Error Message: ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/wv/42dn23fd1cb0czpvqdnb6zw00000gn/T/ipykernel_15181/1879921247.py in <module>
5 # Add a constant variable to the predictor variables
6 X = sm.add_constant(X2)
----> 7 model02 = sm.OLS(y2, X2).fit()
8 df['Multiple Lines'].fillna(sm.OLS(y2, X2).fit(), inplace=True)
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
888 "An exception will be raised in the next version.")
889 warnings.warn(msg, ValueWarning)
--> 890 super(OLS, self).__init__(endog, exog, missing=missing,
891 hasconst=hasconst, **kwargs)
892 if "weights" in self._init_keys:
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
715 else:
716 weights = weights.squeeze()
--> 717 super(WLS, self).__init__(endog, exog, missing=missing,
718 weights=weights, hasconst=hasconst, **kwargs)
719 nobs = self.exog.shape[0]
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, **kwargs)
189 """
190 def __init__(self, endog, exog, **kwargs):
--> 191 super(RegressionModel, self).__init__(endog, exog, **kwargs)
192 self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
193
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
265
266 def __init__(self, endog, exog=None, **kwargs):
--> 267 super().__init__(endog, exog, **kwargs)
268 self.initialize()
269
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
90 missing = kwargs.pop('missing', 'none')
91 hasconst = kwargs.pop('hasconst', None)
---> 92 self.data = self._handle_data(endog, exog, missing, hasconst,
93 **kwargs)
94 self.k_constant = self.data.k_constant
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
130
131 def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 132 data = handle_data(endog, exog, missing, hasconst, **kwargs)
133 # kwargs arrays could have changed, easier to just attach here
134 for key in kwargs:
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
671
672 klass = handle_data_class_factory(endog, exog)
--> 673 return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
674 **kwargs)
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
80 self.orig_endog = endog
81 self.orig_exog = exog
---> 82 self.endog, self.exog = self._convert_endog_exog(endog, exog)
83
84 self.const_idx = None
~/opt/miniconda3/lib/python3.9/site-packages/statsmodels/base/data.py in _convert_endog_exog(self, endog, exog)
505 exog = exog if exog is None else np.asarray(exog)
506 if endog.dtype == object or exog is not None and exog.dtype == object:
--> 507 raise ValueError("Pandas data cast to numpy dtype of object. "
508 "Check input data with np.asarray(data).")
509 return super(PandasData, self)._convert_endog_exog(endog, exog)
ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).