{
"little_mcar_test": {
"p_value": float,
"sample_size": int,
"n_columns_used": int,
"columns_used": list[str], # numeric columns with non-zero variance used in the test; constant columns and entirely-missing columns are excluded
},
"missing_value_pattern_frequencies": [
# Sorted by descending frequency_as_percentage; ties broken lexicographically by missing_columns.
{
"frequency_as_percentage": float,
"missing_columns": str, # column names joined by " & " in lexicographic order;
# empty string for complete rows
},
# ...
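],
"missing_value_pattern_frequencies_top_10": [...], # same schema as missing_value_pattern_frequencies; its first 10 entries, i.e. the most frequent patterns under that list's sort order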
"conditional_missingness_matrix": [
# Not sorted. Use conditional_missingness_matrix_top_10 for the highest-rate entries.
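{
"directional_column_pair": str, # "col_a -> col_b", where col_a is reference_column and col_b is conditionally_missing_column
"reference_column": str, # col_a: the column whose missingness is conditioned on
"conditionally_missing_column": str, # col_b: the column whose missingness rate is reported
"conditional_missing_rate": float, # P(col_b missing | col_a missing)
"n_reference_rows_used": int, # number of rows where col_a (reference_column) is missing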
},
# ...
],
"conditional_missingness_matrix_top_10": [...], # same schema as conditional_missingness_matrix; the 10 highest-rate entries, sorted by descending conditional_missing_rate, ties broken lexicographically by (reference_column, conditionally_missing_column)
"missingness_correlation_matrix": [
# Not sorted. Each unordered pair stored twice (both orderings) so that filtering on
# column_1 alone finds every correlation involving that column.
# Use missingness_correlation_matrix_top_10 for the entries with the largest absolute value
# of the coefficient.
{
"column_pair": str, # "col_1 & col_2"
"column_1": str,
"column_2": str,
"missingness_correlation_coefficient": float,
},
# ...
],
"missingness_correlation_matrix_top_10": [
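# The 10 entries of missingness_correlation_matrix with the largest absolute coefficient. Unique pairs only (each unordered pair stored once here, not twice), sorted by descending absolute value of the coefficient; ties broken lexicographically by (column_1, column_2).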
{
"column_pair": str,
"column_1": str,
"column_2": str,
"missingness_correlation_coefficient": float,
},
# ...
],
}