I want to create a primitive that would count the number of occurrences of the top n categories in a column
The following is my code:
import featuretools as ft
from featuretools.tests.testing_utils import make_ecommerce_entityset
from featuretools.primitives import TransformPrimitive, AggregationPrimitive
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Categorical
class value_count(AggregationPrimitive):
name = "value_count_top_n"
input_types = [ColumnSchema(logical_type = Categorical)]
return_types = ColumnSchema(semantic_tags = {"numeric"})
def __init__(self, n):
self.n = n
def get_function(self):
def value_counts_top_n(column):
vc = column.value_counts().sort_values(ascending = False).iloc[:self.n]
return list(vc)
return value_counts_top_n
es = make_ecommerce_entityset()
feature_matrix, features = ft.dfs(
entityset=es,
target_dataframe_name="sessions",
agg_primitives=[value_count(2)],
trans_primitives=[],
)
This will result in the following error:
TypeConversionError: Error converting datatype for VALUE_COUNT_TOP_N(log.product_id, n=1) from type object to type category. Please confirm the underlying data is consistent with logical type Categorical.
How can I implement the desired primitive that finds a categorical column's value_counts of top n occurrences?