Is there a good way to add semantic enrichment to Enums using pydantic?

129 Views Asked by At

I want to add semantic enrichment to the JSON schema generated by pydantic BaseModels. The problem occurs when I want an element of the model to be an enumerated type. I can't figure out the correct way to add semantic enrichment to the new Enum or the values specified in the Enum's definition. Below is code I hacked. It generates the output I am looking for but it definitely isn't the correct solution. Please offer a better solution or let me know if there is simply a better way to document my data objects with semantic information.

I am using Python 3.11.3 and the following packages:

  • annotated-types==0.6.0
  • packaging==23.2
  • pydantic==2.5.2
  • pydantic_core==2.14.5
  • typing_extensions==4.8.0
    import json
    from enum import Enum
    from typing import Any, Dict

    from pydantic import BaseModel, ConfigDict, Field

    def clean_dictionary(base: dict):
        """Strip taxonomy wrappers out of a decoded JSON structure.

        A dict whose only keys are ``"value"`` and ``"taxonomy"`` is replaced by
        its ``"value"`` entry. Every other dict is walked recursively, and so is
        every list, including lists nested directly inside other lists.

        Args:
            base: Decoded JSON object. It is modified in place.

        Returns:
            ``base["value"]`` when ``base`` itself is a wrapper; otherwise the
            same ``base`` object after its contents have been cleaned.
        """
        # recursion exit: the dict is exactly a {"value", "taxonomy"} wrapper
        if base.keys() == {"value", "taxonomy"}:
            return base["value"]

        # Snapshot the keys so that reassigning values cannot disturb iteration.
        for key in list(base):
            item = base[key]
            if isinstance(item, dict):
                base[key] = clean_dictionary(item)
            elif isinstance(item, list):
                _clean_list(item)

        return base


    def _clean_list(items: list) -> None:
        """Clean, in place, every dict reachable through ``items``."""
        for index, element in enumerate(items):
            if isinstance(element, dict):
                items[index] = clean_dictionary(element)
            elif isinstance(element, list):
                # Without this branch a list of lists would keep its wrappers.
                _clean_list(element)


    class OntologyModel(BaseModel):
        """A model sub-class that cleans the Enums when it generates JSON"""

        # Both dump methods go through JSON text so that values serialized as
        # {"value": ..., "taxonomy": ...} dicts are collapsed to their plain
        # "value" entry by clean_dictionary().

        def model_dump(
            self,
            *,
            mode: str = "python",
            include=None,
            exclude=None,
            by_alias: bool = False,
            exclude_unset: bool = False,
            exclude_defaults: bool = False,
            exclude_none: bool = False,
            round_trip: bool = False,
            warnings: bool = True
        ) -> dict[str, Any]:
            """Return the model as a dict with taxonomy wrappers removed.

            The result is obtained by decoding the text produced by
            ``model_dump_json()``, so it holds JSON-decoded values only.

            Args:
                mode: Accepted so the signature matches the overridden method;
                    it is not forwarded, because the output always comes from
                    the JSON serializer.
                include: Forwarded to ``model_dump_json()``.
                exclude: Forwarded to ``model_dump_json()``.
                by_alias: Forwarded to ``model_dump_json()``.
                exclude_unset: Forwarded to ``model_dump_json()``.
                exclude_defaults: Forwarded to ``model_dump_json()``.
                exclude_none: Forwarded to ``model_dump_json()``.
                round_trip: Forwarded to ``model_dump_json()``.
                warnings: Forwarded to ``model_dump_json()``.

            Returns:
                The decoded, cleaned JSON document.
            """
            text = self.model_dump_json(
                include=include,
                exclude=exclude,
                by_alias=by_alias,
                exclude_unset=exclude_unset,
                exclude_defaults=exclude_defaults,
                exclude_none=exclude_none,
                round_trip=round_trip,
                warnings=warnings,
            )
            return json.loads(text)

        def model_dump_json(
            self,
            indent: int | None = None,
            include=None,
            exclude=None,
            by_alias: bool = False,
            exclude_unset: bool = False,
            exclude_defaults: bool = False,
            exclude_none: bool = False,
            round_trip: bool = False,
            warnings: bool = True,
        ) -> str:
            """Return the model as JSON text with taxonomy wrappers removed.

            The parent serializer's output is decoded, passed through
            ``clean_dictionary()``, and encoded again with ``json.dumps``.

            Args:
                indent: Indentation passed to ``json.dumps`` for the final text.
                include: Forwarded to ``BaseModel.model_dump_json``.
                exclude: Forwarded to ``BaseModel.model_dump_json``.
                by_alias: Forwarded to ``BaseModel.model_dump_json``.
                exclude_unset: Forwarded to ``BaseModel.model_dump_json``.
                exclude_defaults: Forwarded to ``BaseModel.model_dump_json``.
                exclude_none: Forwarded to ``BaseModel.model_dump_json``.
                round_trip: Forwarded to ``BaseModel.model_dump_json``.
                warnings: Forwarded to ``BaseModel.model_dump_json``.

            Returns:
                The cleaned JSON document as a string.
            """
            # Indentation is applied once, by json.dumps below; pretty-printing
            # text that is re-parsed on the next line would be wasted work.
            raw = super().model_dump_json(
                include=include,
                exclude=exclude,
                by_alias=by_alias,
                exclude_unset=exclude_unset,
                exclude_defaults=exclude_defaults,
                exclude_none=exclude_none,
                round_trip=round_trip,
                warnings=warnings,
            )
            data = clean_dictionary(json.loads(raw))
            # ensure_ascii=False keeps non-ASCII characters readable instead of
            # rewriting them as \uXXXX escapes during the re-encode.
            return json.dumps(data, indent=indent, ensure_ascii=False)


    class FlowerEnum(Enum):
        """taxonomy: //example.com/flowers/F000021"""

        # Each member's value is a two-key dict: the plain "value" plus the
        # "taxonomy" identifier attached to that value.
        DAN = {
            "value": "dandelion",
            "taxonomy": "//example.com/flowers#D00012",
        }
        # NOTE(review): "ochid" is presumably a misspelling of "orchid"; it is
        # left untouched because it is the value that gets serialized.
        ORC = {
            "value": "ochid",
            "taxonomy": "//example.com/flowers#O00032",
        }


    class ColorEnum(Enum):
        """taxonomy: https://example.com/colors/C000000"""

        # Each member's value is a two-key dict: the plain "value" plus the
        # "taxonomy" identifier attached to that value.
        RED = {
            "value": "red",
            "taxonomy": "//example.com/colors#C000001",
        }
        PUR = {
            "value": "purple",
            "taxonomy": "//example.com/colors#C000002",
        }


    class Flower(OntologyModel):
        """An instance of a specific flower"""

        # Model-level schema metadata. `model_config = ConfigDict(...)` is the
        # pydantic v2 spelling; the nested `class Config` form is deprecated
        # in v2.
        model_config = ConfigDict(
            json_schema_extra={"taxonomy": "//example.com/flowers#F000003"},
        )

        # Field-level metadata: `json_schema_extra` adds the taxonomy key to the
        # schema of the individual property.
        variety: FlowerEnum = Field(
            ...,
            description="The type of flower",
            json_schema_extra={"taxonomy": "//example.com/flowers#F000004"},
        )
        color: ColorEnum = Field(
            ...,
            description="The flower's color",
            json_schema_extra={"taxonomy": "//example.com/colors#C000005"},
        )


    if __name__ == "__main__":
        from pprint import pprint

        # Demo: build one flower, then show its JSON schema followed by its
        # cleaned dump, with a row of asterisks before, between and after.
        flower = Flower(variety=FlowerEnum.ORC, color=ColorEnum.PUR)
        for render in (flower.model_json_schema, flower.model_dump):
            print("\n", "*" * 80, "\n")
            pprint(render())
        print("\n", "*" * 80, "\n")

The code generates something semi-suitable for my purposes but I would rather create schemas that are more in line with best practices and don't have the goofy hidden requirement that the Enum values be specified in this peculiar way. Below is the output of the code as written:


{'$defs': {'ColorEnum': {'description': 'taxonomy: /example.com/colors/C000000',
                         'enum': [{'taxonomy': '/example.com/colors#C000001',
                                   'value': 'red'},
                                  {'taxonomy': '/example.com/colors#C000002',
                                   'value': 'purple'}],
                         'title': 'ColorEnum'},
           'FlowerEnum': {'description': 'taxonomy: '
                                         '/example.com/flowers/F000021',
                          'enum': [{'taxonomy': '/example.com/flowers#D00012',
                                    'value': 'dandelion'},
                                   {'taxonomy': '/example.com/flowers#O00032',
                                    'value': 'ochid'}],
                          'title': 'FlowerEnum'}},
 'description': 'An instance of a specific flower',
 'properties': {'color': {'allOf': [{'$ref': '#/$defs/ColorEnum'}],
                          'description': "The flower's color",
                          'taxonomy': '/example.com/colors#C000005'},
                'variety': {'allOf': [{'$ref': '#/$defs/FlowerEnum'}],
                            'description': 'The type of flower',
                            'taxonomy': '/example.com/flowers#F000004'}},
 'required': ['variety', 'color'],
 'taxonomy': '/example.com/flowers#F000003',
 'title': 'Flower',
 'type': 'object'}

 ******************************************************************************** 

{'color': 'purple', 'variety': 'ochid'}

 ********************************************************************************
1

There is 1 best solution below

2
Hardik On
  1. Use Field's schema argument (in pydantic v2 the supported keyword for this is `json_schema_extra`) to directly specify the desired JSON schema for enum values:

    from enum import Enum
    
    from pydantic import BaseModel, Field
    
    class FlowerEnum(str, Enum):
        # Flower varieties. Mixing in `str` makes every member a plain string.
        # NOTE(review): "DANDELLION" looks like a misspelling of "DANDELION";
        # the member name is left as written.
        DANDELLION = "dandelion"
        ORCHID = "orchid"
    
    class ColorEnum(str, Enum):
        # Flower colors. Mixing in `str` makes every member a plain string.
        RED = "red"
        PURPLE = "purple"
    
    class Flower(BaseModel):
        # Flower model whose two enum fields carry hand-written JSON-schema
        # metadata. `json_schema_extra` is the pydantic v2 `Field` keyword for
        # adding extra keys to a field's JSON schema; `Field` has no `schema`
        # parameter.
        variety: FlowerEnum = Field(..., json_schema_extra={
            "title": "FlowerEnum",
            "description": "taxonomy: /example.com/flowers/F000021",
            "enum": [
                {"value": "dandelion", "taxonomy": "//example.com/flowers#D00012"},
                {"value": "orchid", "taxonomy": "//example.com/flowers#O00032"}
            ]
        })
        color: ColorEnum = Field(..., json_schema_extra={
            "title": "ColorEnum",
            "description": "taxonomy: /example.com/colors/C000000",
            "enum": [
                {"value": "red", "taxonomy": "//example.com/colors#C000001"},
                {"value": "purple", "taxonomy": "//example.com/colors#C000002"}
            ]
        })
    
  2. Simplify Enums:

    Use str as a base for enums to avoid custom JSON serialization.

  3. Customize JSON Schema Overall:

    Use Config.schema_extra (renamed to `json_schema_extra` in pydantic v2) to add global schema metadata:

     class Flower(BaseModel):
         # ...

         class Config:
             # pydantic v2 renamed the `schema_extra` config key to
             # `json_schema_extra`.
             json_schema_extra = {"taxonomy": "//example.com/flowers#F000003"}