import pandas as pd
from fastai.tabular.all import *
TabularPandas AttributeError: classes
Specify procs, i.e. `TabularPandas(..., procs=[Categorify])`, when you have categorical columns.
This post was written using:
- pandas: 2.2.2
- fastai: 2.7.15
# looking at unique values in each column to split categorical / continuous features
for k in df.keys():
    print(f"Column {k}:\n{Counter(df[k])}")
    print()
Column a:
Counter({2.0: 8435, 1.0: 2580, 3.0: 2011})
Column b:
Counter({12.0: 9853, 32.0: 2911, 80.0: 87, 16.0: 72, 11.0: 47, 8.0: 28, 10.0: 25, 40.0: 3})
Column c:
Counter({0: 1579, 5: 1517, 6: 1271, 7: 1213, 4: 1211, 8: 1081, 3: 898, 9: 831, 10: 682, 2: 608, 1: 342, 11: 278, 14: 216, 12: 206, 16: 184, 13: 165, 15: 145, 17: 131, 18: 94, 19: 88, 20: 66, 21: 57, 23: 54, 29: 24, 26: 18, 24: 16, 27: 16, 22: 12, 30: 12, 25: 7, 28: 4})
Column d:
Counter({15.0: 7521, -1.0: 1433, 8.0: 374, 0.0: 372, 7.0: 365, 9.0: 334, 10.0: 319, 6.0: 316, 5.0: 272, 11.0: 264, 4.0: 241, 12.0: 220, 3.0: 217, 13.0: 205, 14.0: 176, 2.0: 170, 1.0: 164, -3.0: 14, -2.0: 4, -4.0: 3, -27.0: 3, -5.0: 3, -22.0: 2, -26.0: 2, -30.0: 2, -16.0: 2, -7.0: 2, -17.0: 2, -21.0: 1, -23.0: 1, -25.0: 1, -28.0: 1, -31.0: 1, -32.0: 1, -53.0: 1, -56.0: 1, -57.0: 1, -58.0: 1, -59.0: 1, -88.0: 1, -93.0: 1, -98.0: 1, -38.0: 1, -8.0: 1, -11.0: 1, -14.0: 1, -18.0: 1, -9.0: 1, -10.0: 1, -43.0: 1, -49.0: 1, -6.0: 1})
Column e:
Counter({4.1: 1285, 3.4: 912, 3.3: 905, 4.0: 884, 4.2: 812, 3.5: 797, 3.6: 713, 3.7: 708, 3.2: 666, 3.9: 640, 3.8: 628, 4.3: 512, 4.4: 359, 2.1: 351, 4.5: 294, 3.1: 276, 2.2: 269, 4.6: 216, 2.3: 185, 4.7: 175, 2.4: 145, 4.8: 142, 4.9: 135, 5.0: 130, 5.1: 90, 2.5: 88, 5.2: 72, 2.6: 66, 2.8: 64, 2.7: 64, 5.3: 58, 3.0: 54, 2.9: 50, 2.0: 38, 5.4: 34, 5.5: 28, 6.3: 22, 6.6: 21, 5.7: 16, 6.2: 16, 5.6: 16, 5.8: 15, 6.7: 14, 6.8: 12, 6.1: 12, 6.5: 9, 5.9: 8, 6.4: 6, 6.0: 4, 7.2: 4, 6.9: 3, 7.0: 1, 7.5: 1, 7.1: 1})
Column f:
Counter({0.7: 1475, 0.6: 1472, 0.8: 1444, 1.0: 1385, 0.9: 1335, 0.5: 1249, 0.4: 1014, 1.1: 974, 0.3: 708, 0.0: 482, 0.2: 423, 0.1: 294, 1.2: 241, 1.3: 119, 1.4: 81, 1.5: 56, 1.6: 37, 1.8: 33, 1.9: 25, -0.1: 23, 1.7: 22, -1.1: 19, 2.3: 15, 2.4: 12, 2.1: 9, -1.3: 9, -0.6: 8, -0.9: 8, -0.7: 7, 2.0: 7, -0.8: 5, -1.9: 5, -1.2: 4, 2.2: 4, -0.2: 4, 2.8: 3, 2.6: 2, 2.5: 2, -0.3: 2, -0.5: 2, 5.1: 1, -1.0: 1, 2.7: 1, 3.0: 1, -1.8: 1, -1.6: 1, -4.5: 1})
Column label:
Counter({0.0: 11000, 1.0: 2026})
# define categorical and continuous features
cat_names = ['a', 'b']
y_names = 'label'
cont_names = [c for c in df.keys() if c not in cat_names+[y_names]]
print('cat_names:',cat_names)
print('cont_names:',cont_names)
print('y_names:',y_names)
cat_names: ['a', 'b']
cont_names: ['c', 'd', 'e', 'f']
y_names: label
# split into train and test
val_index = list(df.sample(frac=0.2, random_state=0).index) # 20% from total df
train_index = list(df[~df.index.isin(val_index)].index)
assert (len([i for i in train_index if i in set(val_index)])==0
and len([i for i in val_index if i in set(train_index)])==0), 'train and val set are overlapping!'
print('train set len', len(train_index))
print('val set len', len(val_index))
train set len 10421
val set len 2605
Error Example
# oh no, can't train!
dl = TabularPandas(df,
                   cat_names=cat_names,
                   cont_names=cont_names,
                   y_names=y_names,
                   y_block=CategoryBlock(vocab=df[y_names]),
                   splits=(train_index, val_index))
dls = dl.dataloaders(bs=64)
print(dls.show_batch())
learn = tabular_learner(dls, metrics=[accuracy])
learn.fit_one_cycle(3)
| | a | b | c | d | e | f | label |
---|---|---|---|---|---|---|---|
0 | 1.0 | 12.0 | 7.0 | 9.0 | 3.9 | 0.6 | 0.0 |
1 | 2.0 | 12.0 | 3.0 | 15.0 | 4.1 | 0.3 | 0.0 |
2 | 2.0 | 12.0 | 4.0 | -1.0 | 4.0 | 0.7 | 0.0 |
3 | 2.0 | 12.0 | 11.0 | 15.0 | 4.1 | 1.4 | 0.0 |
4 | 2.0 | 12.0 | 4.0 | 12.0 | 4.2 | 0.6 | 0.0 |
5 | 2.0 | 32.0 | 14.0 | 6.0 | 5.2 | 0.2 | 0.0 |
6 | 1.0 | 12.0 | 4.0 | 9.0 | 3.2 | 0.3 | 1.0 |
7 | 3.0 | 32.0 | 5.0 | 15.0 | 3.5 | 0.7 | 0.0 |
8 | 2.0 | 12.0 | 3.0 | 14.0 | 2.6 | 0.5 | 0.0 |
9 | 3.0 | 12.0 | 0.0 | -2.0 | 4.1 | -0.0 | 0.0 |
None
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[43], line 10 8 dls = dl.dataloaders(bs=64) 9 print(dls.show_batch()) ---> 10 learn = tabular_learner(dls, metrics=[accuracy]) 11 learn.fit_one_cycle(3) File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/learner.py:42, in tabular_learner(dls, layers, emb_szs, config, n_out, y_range, **kwargs) 40 if layers is None: layers = [200,100] 41 to = dls.train_ds ---> 42 emb_szs = get_emb_sz(dls.train_ds, {} if emb_szs is None else emb_szs) 43 if n_out is None: n_out = get_c(dls) 44 assert n_out, "`n_out` is not defined, and could not be inferred from data, set `dls.c` or pass `n_out`" File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/model.py:32, in get_emb_sz(to, sz_dict) 27 def get_emb_sz( 28 to:Tabular|TabularPandas, 29 sz_dict:dict=None # Dictionary of {'class_name' : size, ...} to override default `emb_sz_rule` 30 ) -> list: # List of embedding sizes for each category 31 "Get embedding size for each cat_name in `Tabular` or `TabularPandas`, or populate embedding size manually using sz_dict" ---> 32 return [_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names] File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/model.py:32, in <listcomp>(.0) 27 def get_emb_sz( 28 to:Tabular|TabularPandas, 29 sz_dict:dict=None # Dictionary of {'class_name' : size, ...} to override default `emb_sz_rule` 30 ) -> list: # List of embedding sizes for each category 31 "Get embedding size for each cat_name in `Tabular` or `TabularPandas`, or populate embedding size manually using sz_dict" ---> 32 return [_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names] File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/basics.py:507, in GetAttr.__getattr__(self, k) 505 if 
self._component_attr_filter(k): 506 attr = getattr(self,self._default,None) --> 507 if attr is not None: return getattr(attr,k) 508 raise AttributeError(k) File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/transform.py:212, in Pipeline.__getattr__(self, k) --> 212 def __getattr__(self,k): return gather_attrs(self, k, 'fs') File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/transform.py:173, in gather_attrs(o, k, nm) 171 att = getattr(o,nm) 172 res = [t for t in att.attrgot(k) if t is not None] --> 173 if not res: raise AttributeError(k) 174 return res[0] if len(res)==1 else L(res) AttributeError: classes
How to fix this?
I actually went down the rabbit hole and provided the `emb_szs` manually, as mentioned in the source code hinted at by the error message above, but there is actually an easier way – just add `procs=[Categorify]` when initializing `TabularPandas`.
- In the source code, `emb_szs` is expected to be `{'class_name' : size, ...}`. So, for example, if column `a` is a categorical column in our df, then `emb_szs = {'a': len(unique value in column 'a')}`.
Working Example
# now we can train
dl = TabularPandas(df,
                   cat_names=cat_names,
                   cont_names=cont_names,
                   y_names=y_names,
                   y_block=CategoryBlock(vocab=df[y_names]),
                   splits=(train_index, val_index),
                   procs=[Categorify]) # <------ add procs!
dls = dl.dataloaders(bs=64)
print(dls.show_batch())
learn = tabular_learner(dls, metrics=[accuracy])
learn.fit_one_cycle(3)
| | a | b | c | d | e | f | label |
---|---|---|---|---|---|---|---|
0 | 2.0 | 32.0 | 15.0 | 15.0 | 3.6 | 0.8 | 0.0 |
1 | 3.0 | 32.0 | 9.0 | 15.0 | 4.5 | 0.7 | 0.0 |
2 | 2.0 | 32.0 | 9.0 | 14.0 | 3.9 | 0.5 | 1.0 |
3 | 2.0 | 12.0 | 6.0 | -1.0 | 4.0 | 1.0 | 0.0 |
4 | 2.0 | 12.0 | 8.0 | 15.0 | 3.7 | 0.9 | 0.0 |
5 | 2.0 | 12.0 | 4.0 | 15.0 | 3.7 | 0.7 | 1.0 |
6 | 1.0 | 12.0 | 0.0 | 15.0 | 4.0 | 0.7 | 0.0 |
7 | 3.0 | 12.0 | 0.0 | 15.0 | 3.3 | 0.5 | 0.0 |
8 | 2.0 | 12.0 | 5.0 | 15.0 | 4.3 | 0.9 | 0.0 |
9 | 2.0 | 12.0 | 9.0 | 15.0 | 4.3 | 0.9 | 0.0 |
None
epoch | train_loss | valid_loss | accuracy | time |
---|---|---|---|---|
0 | 0.482110 | 0.418015 | 0.832246 | 00:09 |
1 | 0.349364 | 0.333562 | 0.852591 | 00:07 |
2 | 0.322106 | 0.323014 | 0.854511 | 00:07 |
Why did we get this error?
get_emb_sz??
Signature: get_emb_sz(to: 'Tabular | TabularPandas', sz_dict: 'dict' = None) -> 'list' Source: def get_emb_sz( to:Tabular|TabularPandas, sz_dict:dict=None # Dictionary of {'class_name' : size, ...} to override default `emb_sz_rule` ) -> list: # List of embedding sizes for each category "Get embedding size for each cat_name in `Tabular` or `TabularPandas`, or populate embedding size manually using sz_dict" return [_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names] File: /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/model.py Type: function
from fastai.tabular.model import _one_emb_sz
_one_emb_sz??
Signature: _one_emb_sz(classes, n, sz_dict=None) Source: def _one_emb_sz(classes, n, sz_dict=None): "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`." sz_dict = ifnone(sz_dict, {}) n_cat = len(classes[n]) sz = sz_dict.get(n, int(emb_sz_rule(n_cat))) # rule of thumb return n_cat,sz File: /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/model.py Type: function
We see that the error comes from the line `get_emb_sz(dls.train_ds, {} if emb_szs is None else emb_szs)`. The `get_emb_sz` function tries to return `[_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names]`. We get the error because our dataloaders have no `classes` attribute.
Here, the `classes` attribute records which categories we have in each of our categorical columns. In `simple_df` below, we declare the `aa` column as a categorical feature with 3 separate classes, `[1, 2, 3]`. The learner doesn’t know this because we did not specify `Categorify` for our categorical columns when initializing our dataloaders.
simple_df = pd.DataFrame({'aa': [1, 2, 3, 1], 'bb':[1.1, 2.2, 3.3, 5.0], 'label':[1, 0, 1, 1]})
simple_df
| | aa | bb | label |
---|---|---|---|
0 | 1 | 1.1 | 1 |
1 | 2 | 2.2 | 0 |
2 | 3 | 3.3 | 1 |
3 | 1 | 5.0 | 1 |
# no classes attributes
TabularPandas(simple_df,
              cat_names = ['aa'],
              cont_names = ['bb'],
              y_names = ['label'],
              y_block = CategoryBlock(vocab=simple_df[y_names]),
              splits = ([0,1,2], [3]),
              ).dataloaders(bs=64).classes
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[32], line 7 1 TabularPandas(simple_df, 2 cat_names = ['aa'], 3 cont_names = ['bb'], 4 y_names = ['label'], 5 y_block = CategoryBlock(vocab=simple_df[y_names]), 6 splits = ([0,1,2], [3]), ----> 7 ).dataloaders(bs=64).classes File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/basics.py:507, in GetAttr.__getattr__(self, k) 505 if self._component_attr_filter(k): 506 attr = getattr(self,self._default,None) --> 507 if attr is not None: return getattr(attr,k) 508 raise AttributeError(k) File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/basics.py:507, in GetAttr.__getattr__(self, k) 505 if self._component_attr_filter(k): 506 attr = getattr(self,self._default,None) --> 507 if attr is not None: return getattr(attr,k) 508 raise AttributeError(k) File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/basics.py:507, in GetAttr.__getattr__(self, k) 505 if self._component_attr_filter(k): 506 attr = getattr(self,self._default,None) --> 507 if attr is not None: return getattr(attr,k) 508 raise AttributeError(k) File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/transform.py:212, in Pipeline.__getattr__(self, k) --> 212 def __getattr__(self,k): return gather_attrs(self, k, 'fs') File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/transform.py:173, in gather_attrs(o, k, nm) 171 att = getattr(o,nm) 172 res = [t for t in att.attrgot(k) if t is not None] --> 173 if not res: raise AttributeError(k) 174 return res[0] if len(res)==1 else L(res) AttributeError: classes
# now we have classes attributes
TabularPandas(simple_df,
              cat_names = ['aa'],
              cont_names = ['bb'],
              y_names = ['label'],
              y_block = CategoryBlock(vocab=simple_df[y_names]),
              splits = ([0,1,2], [3]),
              procs = [Categorify]
              ).dataloaders(bs=64).classes
{'aa': ['#na#', 1, 2, 3]}
That’s all for now, bye!