Passing custom batch to the validator in Great Expectations

80 Views Asked by At
# Import necessary libraries
import great_expectations as ge
import datetime

# Load the Great Expectations context (project root is the parent directory)
context = ge.data_context.DataContext("../.")

# Load the expectation suite FIRST so it can be attached to the batch when the
# batch is created. Previously the suite was fetched but never used, so the
# batch handed to the validation operator was not tied to it.
expectation_suite_name = 'nestedjson_expectations_suite'
suite = context.get_expectation_suite(expectation_suite_name)

# Load the JSON data into a Pandas DataFrame
data_file_path = "../../data/nested.json"
df = ge.read_json(data_file_path)

# Create a batch of data bound to the stored suite. The validation operator
# validates each batch against the suite the batch itself carries, which is
# how a custom (in-memory, modified) batch and a suite are passed together.
batch = ge.dataset.PandasDataset(df, expectation_suite=suite)

# Create new flat columns for the nested values so column-level expectations
# in the suite can reference them. Each row of "details" is expected to be a
# dict containing "age" and a nested "address" dict.
batch["details_age"] = batch["details"].apply(lambda x: x.get("age"))
batch["details_address_city"] = batch["details"].apply(lambda x: x.get("address").get("city"))
batch["details_address_state"] = batch["details"].apply(lambda x: x.get("address").get("state"))

# Validate the batch (and the suite attached to it) with the configured operator
results = context.run_validation_operator(
    "action_list_operator",
    assets_to_validate=[batch],
    run_name="abcd1",
    run_time=datetime.datetime.now(datetime.timezone.utc),
)

# Print the validation results
print(results)

# Rebuild the Data Docs site and open the page for the first validation result
context.build_data_docs()
context.open_data_docs(resource_identifier=results.list_validation_result_identifiers()[0])

I am working with nested JSON, which I need to flatten before I can validate it. As you can see in the code above, I have modified the batch, but I am unsure how to pass this batch to the validator together with my expectation suite.

According to the code documentation, run_validation_operator expects assets_to_validate, whose members can be either batches (which is what I am already trying) or tuples that let the operator fetch the batch:

 assets_to_validate: a list that specifies the data assets that the operator will validate. 
The members of the list can be either batches, or a tuple that will allow the operator to fetch 
the batch: (batch_kwargs, expectation_suite_name)

Where batch_kwargs is keyword arguments used to request a batch directly from a Datasource.

How am I supposed to pass a custom batch alongside an expectation suite to validate against?

As an alternative, I tested my batch without an expectation suite, like so:

# Import necessary libraries
import great_expectations as ge
import datetime

# Open the Great Expectations project rooted at the parent directory
context = ge.data_context.DataContext("../.")

# Read the nested JSON file and wrap it in a Pandas-backed dataset (the batch)
data_file_path = "../../data/nested.json"
df = ge.read_json(data_file_path)
batch = ge.dataset.PandasDataset(df)

# Flatten the nested "details" records into top-level columns
details = batch["details"]
batch["details_age"] = details.apply(lambda record: record.get("age"))
batch["details_address_city"] = details.apply(
    lambda record: record.get("address").get("city")
)
batch["details_address_state"] = details.apply(
    lambda record: record.get("address").get("state")
)

# Pattern shared by every text column: letters and whitespace only
letters_and_spaces = r'^[A-Za-z\s]+$'

# Expectations on the 'id' column: within 1..100 and unique
batch.expect_column_values_to_be_between("id", min_value=1, max_value=100)
batch.expect_column_values_to_be_unique("id")

# Expectations on the 'name' column: alphabetic text, never null
batch.expect_column_values_to_match_regex("name", letters_and_spaces)
batch.expect_column_values_to_not_be_null("name")

# Expectations on the flattened nested fields
batch.expect_column_values_to_be_between("details_age", min_value=0, max_value=120)
for text_column in ("details_address_city", "details_address_state"):
    batch.expect_column_values_to_match_regex(text_column, letters_and_spaces)

# The stored suite is deliberately not loaded in this variant; the batch is
# validated against the expectations defined inline above.
run_started_at = datetime.datetime.now(datetime.timezone.utc)
results = context.run_validation_operator(
    "action_list_operator",
    assets_to_validate=[batch],
    run_name="abcd1",
    run_time=run_started_at,
)

# Show the raw validation results
print(results)

# Rebuild the Data Docs, then open the page for the first validation result
context.build_data_docs()
first_result_id = results.list_validation_result_identifiers()[0]
context.open_data_docs(resource_identifier=first_result_id)
0

There are 0 best solutions below