TabularPandas AttributeError: classes

fastai
TabularPandas
error
python
Published

June 13, 2024

Specify procs when creating TabularPandas, e.g. TabularPandas(..., procs=[Categorify]), whenever you have categorical columns.

Note

This post was written using:
- pandas: 2.2.2
- fastai: 2.7.15

import pandas as pd
from fastai.tabular.all import *
# look at the unique values in each column to split categorical / continuous features
for k in df.keys():
    print(f"Column {k}:\n{Counter(df[k])}")
    print()
Column a:
Counter({2.0: 8435, 1.0: 2580, 3.0: 2011})

Column b:
Counter({12.0: 9853, 32.0: 2911, 80.0: 87, 16.0: 72, 11.0: 47, 8.0: 28, 10.0: 25, 40.0: 3})

Column c:
Counter({0: 1579, 5: 1517, 6: 1271, 7: 1213, 4: 1211, 8: 1081, 3: 898, 9: 831, 10: 682, 2: 608, 1: 342, 11: 278, 14: 216, 12: 206, 16: 184, 13: 165, 15: 145, 17: 131, 18: 94, 19: 88, 20: 66, 21: 57, 23: 54, 29: 24, 26: 18, 24: 16, 27: 16, 22: 12, 30: 12, 25: 7, 28: 4})

Column d:
Counter({15.0: 7521, -1.0: 1433, 8.0: 374, 0.0: 372, 7.0: 365, 9.0: 334, 10.0: 319, 6.0: 316, 5.0: 272, 11.0: 264, 4.0: 241, 12.0: 220, 3.0: 217, 13.0: 205, 14.0: 176, 2.0: 170, 1.0: 164, -3.0: 14, -2.0: 4, -4.0: 3, -27.0: 3, -5.0: 3, -22.0: 2, -26.0: 2, -30.0: 2, -16.0: 2, -7.0: 2, -17.0: 2, -21.0: 1, -23.0: 1, -25.0: 1, -28.0: 1, -31.0: 1, -32.0: 1, -53.0: 1, -56.0: 1, -57.0: 1, -58.0: 1, -59.0: 1, -88.0: 1, -93.0: 1, -98.0: 1, -38.0: 1, -8.0: 1, -11.0: 1, -14.0: 1, -18.0: 1, -9.0: 1, -10.0: 1, -43.0: 1, -49.0: 1, -6.0: 1})

Column e:
Counter({4.1: 1285, 3.4: 912, 3.3: 905, 4.0: 884, 4.2: 812, 3.5: 797, 3.6: 713, 3.7: 708, 3.2: 666, 3.9: 640, 3.8: 628, 4.3: 512, 4.4: 359, 2.1: 351, 4.5: 294, 3.1: 276, 2.2: 269, 4.6: 216, 2.3: 185, 4.7: 175, 2.4: 145, 4.8: 142, 4.9: 135, 5.0: 130, 5.1: 90, 2.5: 88, 5.2: 72, 2.6: 66, 2.8: 64, 2.7: 64, 5.3: 58, 3.0: 54, 2.9: 50, 2.0: 38, 5.4: 34, 5.5: 28, 6.3: 22, 6.6: 21, 5.7: 16, 6.2: 16, 5.6: 16, 5.8: 15, 6.7: 14, 6.8: 12, 6.1: 12, 6.5: 9, 5.9: 8, 6.4: 6, 6.0: 4, 7.2: 4, 6.9: 3, 7.0: 1, 7.5: 1, 7.1: 1})

Column f:
Counter({0.7: 1475, 0.6: 1472, 0.8: 1444, 1.0: 1385, 0.9: 1335, 0.5: 1249, 0.4: 1014, 1.1: 974, 0.3: 708, 0.0: 482, 0.2: 423, 0.1: 294, 1.2: 241, 1.3: 119, 1.4: 81, 1.5: 56, 1.6: 37, 1.8: 33, 1.9: 25, -0.1: 23, 1.7: 22, -1.1: 19, 2.3: 15, 2.4: 12, 2.1: 9, -1.3: 9, -0.6: 8, -0.9: 8, -0.7: 7, 2.0: 7, -0.8: 5, -1.9: 5, -1.2: 4, 2.2: 4, -0.2: 4, 2.8: 3, 2.6: 2, 2.5: 2, -0.3: 2, -0.5: 2, 5.1: 1, -1.0: 1, 2.7: 1, 3.0: 1, -1.8: 1, -1.6: 1, -4.5: 1})

Column label:
Counter({0.0: 11000, 1.0: 2026})
# define categorical and continuous features
cat_names = ['a', 'b']
y_names = 'label'
cont_names = [c for c in df.keys() if c not in cat_names+[y_names]]


print('cat_names:',cat_names)
print('cont_names:',cont_names)
print('y_names:',y_names)
cat_names: ['a', 'b']
cont_names: ['c', 'd', 'e', 'f']
y_names: label
# split into train and validation sets
val_index = list(df.sample(frac=0.2, random_state=0).index) # 20% from total df
train_index = list(df[~df.index.isin(val_index)].index)

assert (len([i for i in train_index if i in set(val_index)])==0 
        and len([i for i in val_index if i in set(train_index)])==0), 'train and val set are overlapping!'

print('train set len', len(train_index))
print('val set len', len(val_index))
train set len 10421
val set len 2605

Error Example

# oh no, can't train!

dl = TabularPandas(df, 
                   cat_names=cat_names, 
                   cont_names=cont_names, 
                   y_names=y_names,
                   y_block = CategoryBlock(vocab=df[y_names]), 
                   splits=(train_index, val_index))

dls = dl.dataloaders(bs=64)
print(dls.show_batch())
learn = tabular_learner(dls, metrics=[accuracy])
learn.fit_one_cycle(3)
a b c d e f label
0 1.0 12.0 7.0 9.0 3.9 0.6 0.0
1 2.0 12.0 3.0 15.0 4.1 0.3 0.0
2 2.0 12.0 4.0 -1.0 4.0 0.7 0.0
3 2.0 12.0 11.0 15.0 4.1 1.4 0.0
4 2.0 12.0 4.0 12.0 4.2 0.6 0.0
5 2.0 32.0 14.0 6.0 5.2 0.2 0.0
6 1.0 12.0 4.0 9.0 3.2 0.3 1.0
7 3.0 32.0 5.0 15.0 3.5 0.7 0.0
8 2.0 12.0 3.0 14.0 2.6 0.5 0.0
9 3.0 12.0 0.0 -2.0 4.1 -0.0 0.0
None
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[43], line 10
      8 dls = dl.dataloaders(bs=64)
      9 print(dls.show_batch())
---> 10 learn = tabular_learner(dls, metrics=[accuracy])
     11 learn.fit_one_cycle(3)

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/learner.py:42, in tabular_learner(dls, layers, emb_szs, config, n_out, y_range, **kwargs)
     40 if layers is None: layers = [200,100]
     41 to = dls.train_ds
---> 42 emb_szs = get_emb_sz(dls.train_ds, {} if emb_szs is None else emb_szs)
     43 if n_out is None: n_out = get_c(dls)
     44 assert n_out, "`n_out` is not defined, and could not be inferred from data, set `dls.c` or pass `n_out`"

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/model.py:32, in get_emb_sz(to, sz_dict)
     27 def get_emb_sz(
     28     to:Tabular|TabularPandas, 
     29     sz_dict:dict=None # Dictionary of {'class_name' : size, ...} to override default `emb_sz_rule` 
     30 ) -> list: # List of embedding sizes for each category
     31     "Get embedding size for each cat_name in `Tabular` or `TabularPandas`, or populate embedding size manually using sz_dict"
---> 32     return [_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names]

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/model.py:32, in <listcomp>(.0)
     27 def get_emb_sz(
     28     to:Tabular|TabularPandas, 
     29     sz_dict:dict=None # Dictionary of {'class_name' : size, ...} to override default `emb_sz_rule` 
     30 ) -> list: # List of embedding sizes for each category
     31     "Get embedding size for each cat_name in `Tabular` or `TabularPandas`, or populate embedding size manually using sz_dict"
---> 32     return [_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names]

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/basics.py:507, in GetAttr.__getattr__(self, k)
    505 if self._component_attr_filter(k):
    506     attr = getattr(self,self._default,None)
--> 507     if attr is not None: return getattr(attr,k)
    508 raise AttributeError(k)

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/transform.py:212, in Pipeline.__getattr__(self, k)
--> 212 def __getattr__(self,k): return gather_attrs(self, k, 'fs')

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/transform.py:173, in gather_attrs(o, k, nm)
    171 att = getattr(o,nm)
    172 res = [t for t in att.attrgot(k) if t is not None]
--> 173 if not res: raise AttributeError(k)
    174 return res[0] if len(res)==1 else L(res)

AttributeError: classes

How to fix this?

I actually went down the rabbit hole and provided the emb_szs manually, as mentioned in the source code hinted at by the error message above, but there is actually an easier way – just add procs=[Categorify] when initializing TabularPandas.

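  • In the source code, emb_szs is expected to be a dictionary of the form {'class_name' : size, ...}, where each key is the name of a categorical column and size is the embedding size to use for that column instead of the default emb_sz_rule. So for example, if column a is a categorical column in our df, then emb_szs = {'a': len(unique values in column 'a')}.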

Working Example

# now we can train

dl = TabularPandas(df, 
                   cat_names=cat_names, 
                   cont_names=cont_names, 
                   y_names=y_names,
                   y_block = CategoryBlock(vocab=df[y_names]), 
                   splits=(train_index, val_index),
                   procs=[Categorify])  # <------ add procs!

dls = dl.dataloaders(bs=64)
print(dls.show_batch())
learn = tabular_learner(dls, metrics=[accuracy])
learn.fit_one_cycle(3)
a b c d e f label
0 2.0 32.0 15.0 15.0 3.6 0.8 0.0
1 3.0 32.0 9.0 15.0 4.5 0.7 0.0
2 2.0 32.0 9.0 14.0 3.9 0.5 1.0
3 2.0 12.0 6.0 -1.0 4.0 1.0 0.0
4 2.0 12.0 8.0 15.0 3.7 0.9 0.0
5 2.0 12.0 4.0 15.0 3.7 0.7 1.0
6 1.0 12.0 0.0 15.0 4.0 0.7 0.0
7 3.0 12.0 0.0 15.0 3.3 0.5 0.0
8 2.0 12.0 5.0 15.0 4.3 0.9 0.0
9 2.0 12.0 9.0 15.0 4.3 0.9 0.0
None
epoch train_loss valid_loss accuracy time
0 0.482110 0.418015 0.832246 00:09
1 0.349364 0.333562 0.852591 00:07
2 0.322106 0.323014 0.854511 00:07

Why did we get this error?

get_emb_sz??
Signature: get_emb_sz(to: 'Tabular | TabularPandas', sz_dict: 'dict' = None) -> 'list'
Source:   
def get_emb_sz(
    to:Tabular|TabularPandas, 
    sz_dict:dict=None # Dictionary of {'class_name' : size, ...} to override default `emb_sz_rule` 
) -> list: # List of embedding sizes for each category
    "Get embedding size for each cat_name in `Tabular` or `TabularPandas`, or populate embedding size manually using sz_dict"
    return [_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names]
File:      /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/model.py
Type:      function
from fastai.tabular.model import _one_emb_sz
_one_emb_sz??
Signature: _one_emb_sz(classes, n, sz_dict=None)
Source:   
def _one_emb_sz(classes, n, sz_dict=None):
    "Pick an embedding size for `n` depending on `classes` if not given in `sz_dict`."
    sz_dict = ifnone(sz_dict, {})
    n_cat = len(classes[n])
    sz = sz_dict.get(n, int(emb_sz_rule(n_cat)))  # rule of thumb
    return n_cat,sz
File:      /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastai/tabular/model.py
Type:      function

We see that the error comes from the line get_emb_sz(dls.train_ds, {} if emb_szs is None else emb_szs). The get_emb_sz function tries to return [_one_emb_sz(to.classes, n, sz_dict) for n in to.cat_names]. We get the error because our dataloaders have no classes attribute.

Here, the classes attribute describes which categories we have in each of our categorical columns. In simple_df below, we declare the aa column as a categorical feature with 3 separate classes [1, 2, 3]. The learner doesn’t know this because we did not specify Categorify for our categorical column when initializing our dataloaders.

simple_df = pd.DataFrame({'aa': [1, 2, 3, 1], 'bb':[1.1, 2.2, 3.3, 5.0], 'label':[1, 0, 1, 1]})
simple_df
aa bb label
0 1 1.1 1
1 2 2.2 0
2 3 3.3 1
3 1 5.0 1
# no classes attribute

TabularPandas(simple_df, 
              cat_names = ['aa'], 
              cont_names = ['bb'],
              y_names = ['label'],
              y_block = CategoryBlock(vocab=simple_df[y_names]), 
              splits = ([0,1,2], [3]),
             ).dataloaders(bs=64).classes
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[32], line 7
      1 TabularPandas(simple_df, 
      2               cat_names = ['aa'], 
      3               cont_names = ['bb'],
      4               y_names = ['label'],
      5               y_block = CategoryBlock(vocab=simple_df[y_names]), 
      6               splits = ([0,1,2], [3]),
----> 7              ).dataloaders(bs=64).classes

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/basics.py:507, in GetAttr.__getattr__(self, k)
    505 if self._component_attr_filter(k):
    506     attr = getattr(self,self._default,None)
--> 507     if attr is not None: return getattr(attr,k)
    508 raise AttributeError(k)

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/basics.py:507, in GetAttr.__getattr__(self, k)
    505 if self._component_attr_filter(k):
    506     attr = getattr(self,self._default,None)
--> 507     if attr is not None: return getattr(attr,k)
    508 raise AttributeError(k)

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/basics.py:507, in GetAttr.__getattr__(self, k)
    505 if self._component_attr_filter(k):
    506     attr = getattr(self,self._default,None)
--> 507     if attr is not None: return getattr(attr,k)
    508 raise AttributeError(k)

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/transform.py:212, in Pipeline.__getattr__(self, k)
--> 212 def __getattr__(self,k): return gather_attrs(self, k, 'fs')

File /opt/homebrew/Caskroom/miniforge/base/envs/fastai/lib/python3.11/site-packages/fastcore/transform.py:173, in gather_attrs(o, k, nm)
    171 att = getattr(o,nm)
    172 res = [t for t in att.attrgot(k) if t is not None]
--> 173 if not res: raise AttributeError(k)
    174 return res[0] if len(res)==1 else L(res)

AttributeError: classes
# now we have the classes attribute

TabularPandas(simple_df, 
              cat_names = ['aa'], 
              cont_names = ['bb'],
              y_names = ['label'],
              y_block = CategoryBlock(vocab=simple_df[y_names]), 
              splits = ([0,1,2], [3]),
              procs = [Categorify]
             ).dataloaders(bs=64).classes
{'aa': ['#na#', 1, 2, 3]}

That’s all for now, bye!