Dealing with categorical predictor values that are not present in a trained decision tree classifier

I am creating my own DecisionTreeClassifier Python class for datasets with categorical predictors only. There is a dataset which suits the task perfectly. Here is my implementation of the classifier:

InformationEntropy - provides gain_ratio_X(pd.DataFrame, str) and remembers the predictor columns (and their values) and the output class label (and its values).

import numpy as np
import pandas as pd

class InformationEntropy:
  def __init__(self, df, y_label):
    self.y_label = y_label
    self.y_classes = set(df[y_label].to_list())
    self.X_values = dict()
    for label in df.columns:
      if label != y_label:
        self.X_values[label] = set(df[label].to_list())

  def freq(self, df, C_j):
    return df.loc[df[self.y_label] == C_j].shape[0]

  def info(self, df):
    if df.shape[0] == 0:
      return 0

    result = 0
    for y_class in self.y_classes:
      freq_c_div_df = self.freq(df, y_class) / df.shape[0]
      if freq_c_div_df == 0:
        continue
      result -= freq_c_div_df * np.log2(freq_c_div_df)
    return result

  def info_X(self, df, X_label):
    if df.shape[0] == 0:
      return 0

    result = 0
    for attr in self.X_values[X_label]:
      df_i = df.loc[df[X_label] == attr]
      if df_i.shape[0] == 0:
        continue
      result += df_i.shape[0] * self.info(df_i)
    result /= df.shape[0]
    return result

  def split_info_X(self, df, X_label):
    result = 1e-9  # tiny epsilon so gain_ratio_X never divides by zero when only one value is present
    for attr in self.X_values[X_label]:
      df_i = df.loc[df[X_label] == attr]
      if df_i.shape[0] == 0:
        continue
      df_i_div_df = df_i.shape[0] / df.shape[0]
      result -= df_i_div_df * np.log2(df_i_div_df)
    return result

  def gain_ratio_X(self, df, X_label):
    return (self.info(df) - self.info_X(df, X_label)) / self.split_info_X(df, X_label)
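
For reference, these methods compute the usual C4.5-style gain ratio (up to the small 1e-9 guard in split_info_X):

$$\mathrm{Info}(D) = -\sum_{j} \frac{\mathrm{freq}(C_j, D)}{|D|}\,\log_2\frac{\mathrm{freq}(C_j, D)}{|D|}, \qquad \mathrm{Info}_X(D) = \sum_{i} \frac{|D_i|}{|D|}\,\mathrm{Info}(D_i)$$

$$\mathrm{SplitInfo}_X(D) = -\sum_{i} \frac{|D_i|}{|D|}\,\log_2\frac{|D_i|}{|D|}, \qquad \mathrm{GainRatio}_X(D) = \frac{\mathrm{Info}(D) - \mathrm{Info}_X(D)}{\mathrm{SplitInfo}_X(D)}$$

where the $D_i$ are the subsets of $D$ induced by the values of predictor $X$.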

DecisionTreeNode - provides the predict and predict_proba methods and describes the dataframe state at a specific tree node.

class DecisionTreeNode:
  def __init__(self, parent_attribute = None, parent_attribute_value = None, attribute = None, entropy = 0.0, samples_count = 0):
    self.parent_attribute = parent_attribute
    self.parent_attribute_value = parent_attribute_value
    self.attribute = attribute
    self.entropy = entropy
    self.samples_count = samples_count
    self.samples = dict()
    self.probability = dict()
    self.prediction = None
    self.children = list()

  def predict(self, X):
    if len(self.children) == 0: # node is a leaf
      return self.prediction

    for child in self.children:
      if X[self.attribute] == child.parent_attribute_value:
        return child.predict(X)

  def predict_proba(self, X):
    if len(self.children) == 0: # node is a leaf
      return self.probability

    for child in self.children:
      if X[self.attribute] == child.parent_attribute_value:
        return child.predict_proba(X)

DecisionTree - the classifier; builds the decision tree out of DecisionTreeNode elements and exposes predict and predict_proba by delegating to the root node's DecisionTreeNode.predict and DecisionTreeNode.predict_proba.

class DecisionTree:
  def __init__(self, max_leaf_entropy = 0.0, max_leaf_samples = 1):
    assert (max_leaf_entropy > 0) or (max_leaf_samples > 0), "max_leaf_entropy and max_leaf_samples can't both be 0"

    self.decision_tree_node = None
    self.max_leaf_entropy = max_leaf_entropy
    self.max_leaf_samples = max_leaf_samples
    self.info_entropy = None

  def build_tree(self, df, TreeNode):
    if df.shape[0] == 0:
      return

    best_attr = None
    best_ratio = 0
    for attr in self.info_entropy.X_values:
      ratio = self.info_entropy.gain_ratio_X(df, attr)
      if best_ratio < ratio:
        best_attr = attr
        best_ratio = ratio

    TreeNode.attribute = best_attr
    TreeNode.entropy = best_ratio
    max_samples_count = 0
    for y_class in self.info_entropy.y_classes:
      TreeNode.samples[y_class] = df.loc[df[self.info_entropy.y_label] == y_class].shape[0]
      TreeNode.probability[y_class] = TreeNode.samples[y_class] / df.shape[0]
      if max_samples_count < TreeNode.samples[y_class]:
        max_samples_count = TreeNode.samples[y_class]
        TreeNode.prediction = y_class
    TreeNode.samples_count = df.shape[0]

    if (TreeNode.entropy > self.max_leaf_entropy) and (TreeNode.samples_count > self.max_leaf_samples):
      for attr in self.info_entropy.X_values[best_attr]:
        df_loc = df.loc[df[best_attr] == attr]
        if df_loc.shape[0] > 0:
          child = DecisionTreeNode()
          child.parent_attribute = best_attr
          child.parent_attribute_value = attr
          TreeNode.children.append(child)
          self.build_tree(df_loc, TreeNode.children[-1])

  def fit(self, df: pd.DataFrame, y_label: str):
    self.info_entropy = InformationEntropy(df, y_label) 
    self.decision_tree_node = DecisionTreeNode()
    self.build_tree(df, self.decision_tree_node)
    return self

  def predict(self, X_test):
    y_test = []
    for i in range(X_test.shape[0]):
      y_test.append(self.decision_tree_node.predict(X_test.iloc[i]))
    return y_test

  def predict_proba(self, X_test):
    y_test = []
    for i in range(X_test.shape[0]):
      y_test.append(self.decision_tree_node.predict_proba(X_test.iloc[i]))
    return y_test
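
For illustration, a minimal usage sketch with a made-up toy dataframe (the column names and values are arbitrary):

import pandas as pd

train = pd.DataFrame({
    "outlook": ["sunny", "sunny", "rain", "rain"],
    "wind":    ["weak", "strong", "weak", "strong"],
    "play":    ["no", "no", "yes", "yes"],
})

clf = DecisionTree().fit(train, "play")
print(clf.predict(train.drop(columns="play")))                 # ['no', 'no', 'yes', 'yes']
print(clf.predict_proba(train.drop(columns="play").iloc[:1]))  # e.g. [{'no': 1.0, 'yes': 0.0}], key order may vary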

There is an obvious problem in DecisionTreeNode.predict (and predict_proba) that often shows up because sklearn.model_selection.train_test_split may choose a train part that doesn't cover every value of every predictor. In that case,

for child in self.children:
  if X[self.attribute] == child.parent_attribute_value:
    return child.predict(X)

cannot find such a child and implicitly returns None.
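
For example, here is a minimal way to reproduce it with a made-up one-column dataset (I hard-code the split instead of calling train_test_split so that the missing value is guaranteed):

import pandas as pd

train = pd.DataFrame({
    "color": ["red", "red", "blue", "blue"],
    "label": ["yes", "yes", "no", "no"],
})
test = pd.DataFrame({"color": ["green"]})  # "green" never occurs in the training data

clf = DecisionTree().fit(train, "label")
print(clf.predict(test))  # prints [None]: the root splits on "color" and has no child for "green"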

I don't want to encode the categorical options into additional columns, so I can't think of any reasonable solution except this one:

*Changing DecisionTreeNode.predict (and predict_proba)*

def predict(self, X):
  for child in self.children:
    if X[self.attribute] == child.parent_attribute_value:
      return child.predict(X)
  
  # happens if node is a leaf or there's no suitable child
  return self.prediction

Now it returns an inaccurate result, which is better than returning nothing, but the mismatch can happen at the very beginning of the tree, and then the prediction can be very irrelevant.

I am wondering if there is a more proper solution.
