Note: this doesn’t seem to be a problem for the Ames dataset, but it can be a problem for other datasets.

from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import fetch_openml

# Fetch the Ames housing dataset from OpenML (dataset id 41211) as pandas objects.
ames_housing = fetch_openml(data_id=41211, as_frame=True)

# Separate the feature table from the prediction target.
data, target = ames_housing.data, ames_housing.target

# Preview the first rows of the feature table.
data.head()
/home/halgoz/work/udacity-c6/c6/content/modules/M1/.venv/lib/python3.12/site-packages/sklearn/datasets/_openml.py:1035: UserWarning: Version 1 of dataset ames-housing is inactive, meaning that issues have been found in the dataset. Try using a newer version from this URL: https://openml.org/data/v1/download/20649135/ames-housing.arff
  warn(
MS_SubClass MS_Zoning Lot_Frontage Lot_Area Street Alley Lot_Shape Land_Contour Utilities Lot_Config ... Pool_QC Fence Misc_Feature Misc_Val Mo_Sold Year_Sold Sale_Type Sale_Condition Longitude Latitude
0 One_Story_1946_and_Newer_All_Styles Residential_Low_Density 141 31770 Pave No_Alley_Access Slightly_Irregular Lvl AllPub Corner ... No_Pool No_Fence None 0 5 2010 WD Normal -93.619754 42.054035
1 One_Story_1946_and_Newer_All_Styles Residential_High_Density 80 11622 Pave No_Alley_Access Regular Lvl AllPub Inside ... No_Pool Minimum_Privacy None 0 6 2010 WD Normal -93.619756 42.053014
2 One_Story_1946_and_Newer_All_Styles Residential_Low_Density 81 14267 Pave No_Alley_Access Slightly_Irregular Lvl AllPub Corner ... No_Pool No_Fence Gar2 12500 6 2010 WD Normal -93.619387 42.052659
3 One_Story_1946_and_Newer_All_Styles Residential_Low_Density 93 11160 Pave No_Alley_Access Regular Lvl AllPub Corner ... No_Pool No_Fence None 0 4 2010 WD Normal -93.617320 42.051245
4 Two_Story_1946_and_Newer Residential_Low_Density 74 13830 Pave No_Alley_Access Slightly_Irregular Lvl AllPub Inside ... No_Pool Minimum_Privacy None 0 3 2010 WD Normal -93.638933 42.060899

5 rows × 80 columns

# Print a per-column summary (dtype and non-null count) plus total memory usage.
data.info()
<class 'pandas.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 80 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   MS_SubClass         2930 non-null   category
 1   MS_Zoning           2930 non-null   category
 2   Lot_Frontage        2930 non-null   int64   
 3   Lot_Area            2930 non-null   int64   
 4   Street              2930 non-null   category
 5   Alley               2930 non-null   category
 6   Lot_Shape           2930 non-null   category
 7   Land_Contour        2930 non-null   category
 8   Utilities           2930 non-null   category
 9   Lot_Config          2930 non-null   category
 10  Land_Slope          2930 non-null   category
 11  Neighborhood        2930 non-null   category
 12  Condition_1         2930 non-null   category
 13  Condition_2         2930 non-null   category
 14  Bldg_Type           2930 non-null   category
 15  House_Style         2930 non-null   category
 16  Overall_Qual        2930 non-null   category
 17  Overall_Cond        2930 non-null   category
 18  Year_Built          2930 non-null   int64   
 19  Year_Remod_Add      2930 non-null   int64   
 20  Roof_Style          2930 non-null   category
 21  Roof_Matl           2930 non-null   category
 22  Exterior_1st        2930 non-null   category
 23  Exterior_2nd        2930 non-null   category
 24  Mas_Vnr_Type        2930 non-null   category
 25  Mas_Vnr_Area        2930 non-null   int64   
 26  Exter_Qual          2930 non-null   category
 27  Exter_Cond          2930 non-null   category
 28  Foundation          2930 non-null   category
 29  Bsmt_Qual           2930 non-null   category
 30  Bsmt_Cond           2930 non-null   category
 31  Bsmt_Exposure       2930 non-null   category
 32  BsmtFin_Type_1      2930 non-null   category
 33  BsmtFin_SF_1        2930 non-null   int64   
 34  BsmtFin_Type_2      2930 non-null   category
 35  BsmtFin_SF_2        2930 non-null   int64   
 36  Bsmt_Unf_SF         2930 non-null   int64   
 37  Total_Bsmt_SF       2930 non-null   int64   
 38  Heating             2930 non-null   category
 39  Heating_QC          2930 non-null   category
 40  Central_Air         2930 non-null   category
 41  Electrical          2930 non-null   category
 42  First_Flr_SF        2930 non-null   int64   
 43  Second_Flr_SF       2930 non-null   int64   
 44  Low_Qual_Fin_SF     2930 non-null   int64   
 45  Gr_Liv_Area         2930 non-null   int64   
 46  Bsmt_Full_Bath      2930 non-null   int64   
 47  Bsmt_Half_Bath      2930 non-null   int64   
 48  Full_Bath           2930 non-null   int64   
 49  Half_Bath           2930 non-null   int64   
 50  Bedroom_AbvGr       2930 non-null   int64   
 51  Kitchen_AbvGr       2930 non-null   int64   
 52  Kitchen_Qual        2930 non-null   category
 53  TotRms_AbvGrd       2930 non-null   int64   
 54  Functional          2930 non-null   category
 55  Fireplaces          2930 non-null   int64   
 56  Fireplace_Qu        2930 non-null   category
 57  Garage_Type         2930 non-null   category
 58  Garage_Finish       2930 non-null   category
 59  Garage_Cars         2930 non-null   int64   
 60  Garage_Area         2930 non-null   int64   
 61  Garage_Qual         2930 non-null   category
 62  Garage_Cond         2930 non-null   category
 63  Paved_Drive         2930 non-null   category
 64  Wood_Deck_SF        2930 non-null   int64   
 65  Open_Porch_SF       2930 non-null   int64   
 66  Enclosed_Porch      2930 non-null   int64   
 67  Three_season_porch  2930 non-null   int64   
 68  Screen_Porch        2930 non-null   int64   
 69  Pool_Area           2930 non-null   int64   
 70  Pool_QC             2930 non-null   category
 71  Fence               2930 non-null   category
 72  Misc_Feature        2930 non-null   category
 73  Misc_Val            2930 non-null   int64   
 74  Mo_Sold             2930 non-null   int64   
 75  Year_Sold           2930 non-null   int64   
 76  Sale_Type           2930 non-null   category
 77  Sale_Condition      2930 non-null   category
 78  Longitude           2930 non-null   float64 
 79  Latitude            2930 non-null   float64 
dtypes: category(46), float64(2), int64(32)
memory usage: 913.2 KB
# Keep only the string / categorical features; these are the ones that
# will be one-hot encoded.
cat_data = data.select_dtypes(include=['str', 'category'])

# Names of the selected columns, in their original order.
categorical_columns = cat_data.columns.tolist()

# Preview the categorical subset.
cat_data.head()
MS_SubClass MS_Zoning Street Alley Lot_Shape Land_Contour Utilities Lot_Config Land_Slope Neighborhood ... Garage_Type Garage_Finish Garage_Qual Garage_Cond Paved_Drive Pool_QC Fence Misc_Feature Sale_Type Sale_Condition
0 One_Story_1946_and_Newer_All_Styles Residential_Low_Density Pave No_Alley_Access Slightly_Irregular Lvl AllPub Corner Gtl North_Ames ... Attchd Fin Typical Typical Partial_Pavement No_Pool No_Fence None WD Normal
1 One_Story_1946_and_Newer_All_Styles Residential_High_Density Pave No_Alley_Access Regular Lvl AllPub Inside Gtl North_Ames ... Attchd Unf Typical Typical Paved No_Pool Minimum_Privacy None WD Normal
2 One_Story_1946_and_Newer_All_Styles Residential_Low_Density Pave No_Alley_Access Slightly_Irregular Lvl AllPub Corner Gtl North_Ames ... Attchd Unf Typical Typical Paved No_Pool No_Fence Gar2 WD Normal
3 One_Story_1946_and_Newer_All_Styles Residential_Low_Density Pave No_Alley_Access Regular Lvl AllPub Corner Gtl North_Ames ... Attchd Fin Typical Typical Paved No_Pool No_Fence None WD Normal
4 Two_Story_1946_and_Newer Residential_Low_Density Pave No_Alley_Access Slightly_Irregular Lvl AllPub Inside Gtl Gilbert ... Attchd Fin Typical Typical Paved No_Pool Minimum_Privacy None WD Normal

5 rows × 46 columns

def _memory_mb(frame):
    """Return the deep memory footprint of *frame* in MB (bytes / 1024**2)."""
    return frame.memory_usage(deep=True).sum() / 1024**2


# Baseline: width and size of the categorical columns before encoding.
ncols_before = cat_data.shape[1]
mem_before = _memory_mb(cat_data)

# Dense one-hot encoder that returns a pandas DataFrame; categories not seen
# during fit are ignored at transform time instead of raising.
encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
).set_output(transform='pandas')

# cat_data holds exactly the columns listed in categorical_columns.
cat_data_encoded = encoder.fit_transform(cat_data)

# Same measurements after encoding.
ncols_after = cat_data_encoded.shape[1]
mem_after = _memory_mb(cat_data_encoded)

# Report the before/after comparison.
print(f"Columns Before: {ncols_before}")
print(f"Columns After: {ncols_after}")
print(f"Memory Before: {mem_before:.2f} MB")
print(f"Memory After: {mem_after:.2f} MB")
Columns Before: 46
Columns After: 318
Memory Before: 0.15 MB
Memory After: 7.11 MB