I'm trying to mitigate unfairness for a model I trained using an imblearn pipeline with ADASYN. My pipeline looks like this:
loaded_model = Pipeline(steps=[('feature_scaler', StandardScaler()),
('adasyn_resampling', ADASYN(random_state=123)),
('low_variance_remover', VarianceThreshold(threshold=0.01)),
('feature_selection', SelectFromModel(estimator=RandomForestClassifier(random_state=0), max_features=35)),
('classifier', CalibratedClassifierCV(base_estimator=LogisticRegression(C=0.1, penalty='l1', random_state=0, solver='saga'), method='isotonic'))])
When I try to run the ExponentiatedGradient:
from fairlearn.reductions import EqualizedOdds, ExponentiatedGradient
exponentiated_gradient = ExponentiatedGradient(
estimator=loaded_model,
constraints=EqualizedOdds(difference_bound=0.01),
sample_weight_name="classifier__sample_weight")
exponentiated_gradient.fit(X_train.drop(columns=['PCa-ID']), y_train, sensitive_features=sf_train)
I get this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_2882607/3174663212.py in <module>
32 sample_weight_name="classifier__sample_weight")
33
---> 34 exponentiated_gradient.fit(X_train.drop(columns=['PCa-ID']), y_train, sensitive_features=sf_train)
~/anaconda3/lib/python3.9/site-packages/fairlearn/reductions/_exponentiated_gradient/exponentiated_gradient.py in fit(self, X, y, **kwargs)
166
167 # select classifier according to best_h method
--> 168 h, h_idx = lagrangian.best_h(lambda_vec)
169
170 if t == 0:
~/anaconda3/lib/python3.9/site-packages/fairlearn/reductions/_exponentiated_gradient/_lagrangian.py in best_h(self, lambda_vec)
232 the vector of Lagrange multipliers `lambda_vec`.
233 """
--> 234 classifier = self._call_oracle(lambda_vec)
235
236 def h(X):
~/anaconda3/lib/python3.9/site-packages/fairlearn/reductions/_exponentiated_gradient/_lagrangian.py in _call_oracle(self, lambda_vec)
220
221 oracle_call_start_time = time()
--> 222 estimator.fit(self.constraints.X, redY, **{self.sample_weight_name: redW})
223 self.oracle_execution_times.append(time() - oracle_call_start_time)
224 self.n_oracle_calls += 1
~/anaconda3/lib/python3.9/site-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
295 if self._final_estimator != "passthrough":
296 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 297 self._final_estimator.fit(Xt, yt, **fit_params_last_step)
298 return self
299
~/anaconda3/lib/python3.9/site-packages/sklearn/calibration.py in fit(self, X, y, sample_weight)
269 X, y = indexable(X, y)
270 if sample_weight is not None:
--> 271 sample_weight = _check_sample_weight(sample_weight, X)
272
273 if self.base_estimator is None:
~/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py in _check_sample_weight(sample_weight, X, dtype, copy)
1563
1564 if sample_weight.shape != (n_samples,):
-> 1565 raise ValueError(
1566 "sample_weight.shape == {}, expected {}!".format(
1567 sample_weight.shape, (n_samples,)
ValueError: sample_weight.shape == (4834,), expected (6967,)!
I imagine this is due to the resampling step: ADASYN increases the number of training samples (4834 → 6967 here), but the `sample_weight` vector that fairlearn passes into the pipeline still has the original length, so it no longer matches by the time it reaches the final classifier. Does anyone have a solution?