I am creating my own DecisionTreeClassifier Python class for dataset with categorical predictors only. There is a dataset which is perfectly suits for the task. Here is my implementation of that classifier:
InformationEntropy - provides gain_ratio_X(pd.DataFrame, str) and remembers: predictors columns (and values), output class label (and values).
class InformationEntropy:
def __init__(self, df, y_label):
self.y_label = y_label
self.y_classes = set(df[y_label].to_list())
self.X_values = dict()
for label in df.columns:
if label != y_label:
self.X_values[label] = set(df[label].to_list())
def freq(self, df, C_j):
return df.loc[df[y_label] == C_j].shape[0]
def info(self, df):
if df.shape[0] == 0:
return 0
result = 0
for y_class in self.y_classes:
freq_c_div_df = self.freq(df, y_class) / df.shape[0]
if freq_c_div_df == 0:
continue
result -= freq_c_div_df * np.log2(freq_c_div_df)
return result
def info_X(self, df, X_label):
if df.shape[0] == 0:
return 0
result = 0
for attr in self.X_values[X_label]:
df_i = df.loc[df[X_label] == attr]
if df_i.shape[0] == 0:
continue
result += df_i.shape[0] * self.info(df_i)
result /= df.shape[0]
return result
def split_info_X(self, df, X_label):
result = 1e-9
for attr in self.X_values[X_label]:
df_i = df.loc[df[X_label] == attr]
if df_i.shape[0] == 0:
continue
df_i_div_df = df_i.shape[0] / df.shape[0]
result -= df_i_div_df * np.log2(df_i_div_df)
return result
def gain_ratio_X(self, df, X_label):
return (self.info(df) - self.info_X(df, X_label)) / self.split_info_X(df, X_label)
DecisionTreeNode - provides predict and predict_proba methods, describes dataframe state at specific tree node.
class DecisionTreeNode:
def __init__(self, parent_attribute = None, parent_attribute_value = None, attribute = None, entropy = 0.0, samples_count = 0):
self.parent_attribute = parent_attribute
self.parent_attribute_value = parent_attribute_value
self.attribute = attribute
self.entropy = entropy
self.samples_count = samples_count
self.samples = dict()
self.probability = dict()
self.prediction = None
self.children = list()
def predict(self, X):
if len(self.children) == 0: # node is a leaf
return self.prediction
for child in self.children:
if X[self.attribute] == child.parent_attribute_value:
return child.predict(X)
def predict_proba(self, X):
if len(self.children) == 0: # node is a leaf
return self.probability
for child in self.children:
if X[self.attribute] == child.parent_attribute_value:
return child.predict_proba(X)
DecisionTree - classifier, builds decision tree by DecisionTreeNode elements, provides predict and predict_proba as root element DecisionTreeNode.predict and DecisionTreeNode.predict_proba.
class DecisionTree:
def __init__(self, max_leaf_entropy = 0.0, max_leaf_samples = 1):
assert (max_leaf_entropy > 0) or (max_leaf_samples > 0), "Entropy ratio and samples count to define leaf can't be 0 at once"
self.decision_tree_node = None
self.max_leaf_entropy = max_leaf_entropy
self.max_leaf_samples = max_leaf_samples
self.info_entropy = None
def build_tree(self, df, TreeNode):
if df.shape[0] == 0:
return
best_attr = None
best_ratio = 0
for attr in self.info_entropy.X_values:
ratio = self.info_entropy.gain_ratio_X(df, attr)
if best_ratio < ratio:
best_attr = attr
best_ratio = ratio
TreeNode.attribute = best_attr
TreeNode.entropy = best_ratio
max_samples_count = 0
for y_class in self.info_entropy.y_classes:
TreeNode.samples[y_class] = df.loc[df[self.info_entropy.y_label] == y_class].shape[0]
TreeNode.probability[y_class] = TreeNode.samples[y_class] / df.shape[0]
if max_samples_count < TreeNode.samples[y_class]:
max_samples_count = TreeNode.samples[y_class]
TreeNode.prediction = y_class
TreeNode.samples_count = df.shape[0]
if (TreeNode.entropy > self.max_leaf_entropy) and (TreeNode.samples_count > self.max_leaf_samples):
for attr in self.info_entropy.X_values[best_attr]:
df_loc = df.loc[df[best_attr] == attr]
if df_loc.shape[0] > 0:
child = DecisionTreeNode()
child.parent_attribute = best_attr
child.parent_attribute_value = attr
TreeNode.children.append(child)
self.build_tree(df_loc, TreeNode.children[-1])
def fit(self, df: pd.DataFrame, y_label: str):
self.info_entropy = InformationEntropy(df, y_label)
self.decision_tree_node = DecisionTreeNode()
self.build_tree(df, self.decision_tree_node)
return self
def predict(self, X_test):
y_test = []
for i in range(X_test.shape[0]):
y_test.append(self.decision_tree_node.predict(X_test.iloc[i]))
return y_test
def predict_proba(self, X_test):
y_test = []
for i in range(X_test.shape[0]):
y_test.append(self.decision_tree_node.predict_proba(X_test.iloc[i]))
return y_test
There is an obvious problem in DecisionTreeNode.predict (and _proba) that frequently happens because of sklearn.model_selection.train_test_split when it choses train part that doesn't cover every value of all predictors. In that case,
for child in self.children:
if X[self.attribute] == child.parent_attribute_value:
return child.predict(X)
can't find such child and actually returns nothing.
I don't want to encode categorical options to additional columns, so I can't imagine any reasonable solution except this one:
*Changing DecisionTreeNode.predict (and _probe)*
def predict(self, X):
for child in self.children:
if X[self.attribute] == child.parent_attribute_value:
return child.predict(X)
# happens if node is a leaf or there's no suitable child
return self.prediction
Now it returns inaccurate result which is better then return nothing, but it can happen at the very begining of the tree and prediction can be very irrelevant.
I am wondering if there is more proper solution.