Why can pickling a sklearn decision tree generate a pickle thousands of times bigger (in terms of memory) than the original estimator?
I ran into this issue at work, where a random forest estimator (with 100 decision trees) trained on a dataset with around 1,000,000 samples and 7 features produced a pickle larger than 2 GB.
I tracked the issue down to the pickling of a single decision tree (a sketch of that per-tree check is just below), and I was able to replicate it with a generated dataset, as shown in the code that follows.
For memory estimations I used the pympler library. The sklearn version used is 1.0.1.
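For reference, this is a minimal sketch of the kind of per-tree check I mean (illustrative parameters, not the exact code from work): pickle each tree of the forest separately and see that a single tree already shows the blow-up.

import pickle

from sklearn.datasets import make_friedman1
from sklearn.ensemble import RandomForestRegressor

X, y = make_friedman1(n_samples=100_000, n_features=7, noise=1.0, random_state=49)
forest = RandomForestRegressor(n_estimators=10, max_depth=50, min_samples_split=5)
forest.fit(X, y)

for i, tree in enumerate(forest.estimators_):
    # per-tree pickle size in bytes
    print(i, len(pickle.dumps(tree)))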
# here using a regressor tree, but I would expect the same issue with a classification tree
import pickle
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_friedman1  # using a dataset generation function from sklearn
from pympler import asizeof

# function that creates the dataset and trains the estimator
def make_example(n_samples: int):
    X, y = make_friedman1(n_samples=n_samples, n_features=7, noise=1.0, random_state=49)
    estimator = DecisionTreeRegressor(max_depth=50, max_features='auto', min_samples_split=5)
    estimator.fit(X, y)
    return X, y, estimator
# utilities to compute and compare the size of an object and its pickled version
def readable_size(size_in_bytes: int, suffix='B') -> str:
    num = size_in_bytes
    for unit in ['', 'k', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

def print_size(obj, skip_detail=False):
    obj_size = asizeof.asized(obj).size
    print(readable_size(obj_size))
    return obj_size

def compare_with_pickle(obj):
    size_obj = print_size(obj)
    size_pickle = print_size(pickle.dumps(obj))
    print(f"Ratio pickle/obj: {(size_pickle / size_obj):.2f}")
_, _, model100K = make_example(100_000)
compare_with_pickle(model100K)
_, _, model1M = make_example(1_000_000)
compare_with_pickle(model1M)
output:
1.7 kB
4.9 MB
Ratio pickle/obj: 2876.22
1.7 kB
49.3 MB
Ratio pickle/obj: 28982.84