
chispa's People

Contributors

alexmooney, alexott, anaynayak, antondemeester, arunvelsriram, calum-mcg, chris-remedy, dependabot[bot], ethnhll, lucas-lm, mitches-got-glitches, mrpowers, orcascope, ratsclub, semyonsinchenko

chispa's Issues

Flatten dataframe

Hi, can we add a function to flatten a nested dataframe?

from pyspark.sql.functions import col, explode_outer, map_keys
from pyspark.sql.types import ArrayType, MapType, StructType


def flatten_test(df, sep="_"):
    """Returns a flattened dataframe.
    .. versionadded:: x.X.X

    Parameters
    ----------
    sep : str
        Delimiter for flattened columns. Default `_`

    Notes
    -----
    Don't use `.` as `sep`: it won't work on nested data frames with more than
    one level, and you will have to use `columns.name`.

    Flattening MapType columns requires finding every distinct key in the column,
    which can be slow.

    Examples
    --------

    data_mixed = [
        {
            "state": "Florida",
            "shortname": "FL",
            "info": {"governor": "Rick Scott"},
            "counties": [
                {"name": "Dade", "population": 12345},
                {"name": "Broward", "population": 40000},
                {"name": "Palm Beach", "population": 60000},
            ],
        },
        {
            "state": "Ohio",
            "shortname": "OH",
            "info": {"governor": "John Kasich"},
            "counties": [
                {"name": "Summit", "population": 1234},
                {"name": "Cuyahoga", "population": 1337},
            ],
        },
    ]

    data_mixed = spark.createDataFrame(data=data_mixed)

    data_mixed.printSchema()

    root
    |-- counties: array (nullable = true)
    |    |-- element: map (containsNull = true)
    |    |    |-- key: string
    |    |    |-- value: string (valueContainsNull = true)
    |-- info: map (nullable = true)
    |    |-- key: string
    |    |-- value: string (valueContainsNull = true)
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)


    data_mixed_flat = flatten_test(data_mixed, sep=":")
    data_mixed_flat.printSchema()
    root
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- counties:name: string (nullable = true)
    |-- counties:population: string (nullable = true)
    |-- info:governor: string (nullable = true)




    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]


    df = spark.createDataFrame(data=data)

    df.printSchema()

    root
    |-- fitness: map (nullable = true)
    |    |-- key: string
    |    |-- value: long (valueContainsNull = true)
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)

    df_flat = flatten_test(df, sep=":")

    df_flat.printSchema()

    root
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)
    |-- fitness:height: long (nullable = true)
    |-- fitness:weight: long (nullable = true)

    data_struct = [
            (("James",None,"Smith"),"OH","M"),
            (("Anna","Rose",""),"NY","F"),
            (("Julia","","Williams"),"OH","F"),
            (("Maria","Anne","Jones"),"NY","M"),
            (("Jen","Mary","Brown"),"NY","M"),
            (("Mike","Mary","Williams"),"OH","M")
            ]


    schema = StructType([
        StructField('name', StructType([
            StructField('firstname', StringType(), True),
            StructField('middlename', StringType(), True),
            StructField('lastname', StringType(), True)
            ])),
        StructField('state', StringType(), True),
        StructField('gender', StringType(), True)
        ])

    df_struct = spark.createDataFrame(data = data_struct, schema = schema)

    df_struct.printSchema()

    root
    |-- name: struct (nullable = true)
    |    |-- firstname: string (nullable = true)
    |    |-- middlename: string (nullable = true)
    |    |-- lastname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)

    df_struct_flat = flatten_test(df_struct, sep=":")

    df_struct_flat.printSchema()

    root
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)
    |-- name:firstname: string (nullable = true)
    |-- name:middlename: string (nullable = true)
    |-- name:lastname: string (nullable = true)
    """
    # Collect the complex fields (ArrayType, StructType and MapType) in the schema.
    complex_fields = {
        field.name: field.dataType
        for field in df.schema.fields
        if isinstance(field.dataType, (ArrayType, StructType, MapType))
    }

    while len(complex_fields) != 0:
        col_name = list(complex_fields.keys())[0]
        # print ("Processing :"+col_name+" Type : "+str(type(complex_fields[col_name])))

        # if StructType then convert all sub element to columns.
        # i.e. flatten structs
        if isinstance(complex_fields[col_name], StructType):
            expanded = [
                col(col_name + "." + k).alias(col_name + sep + k)
                for k in [n.name for n in complex_fields[col_name]]
            ]
            df = df.select("*", *expanded).drop(col_name)

        # if ArrayType then add the Array Elements as Rows using the explode function
        # i.e. explode Arrays
        elif isinstance(complex_fields[col_name], ArrayType):
            df = df.withColumn(col_name, explode_outer(col_name))

        # if MapType then convert all sub element to columns.
        # i.e. flatten
        elif isinstance(complex_fields[col_name], MapType):
            keys_df = df.select(explode_outer(map_keys(col(col_name)))).distinct()
            keys = list(map(lambda row: row[0], keys_df.collect()))
            key_cols = list(
                map(
                    lambda f: col(col_name).getItem(f).alias(str(col_name + sep + f)),
                    keys,
                )
            )
            drop_column_list = [col_name]
            df = df.select(
                [
                    col_name
                    for col_name in df.columns
                    if col_name not in drop_column_list
                ]
                + key_cols
            )

        # Recompute the remaining complex fields in the schema.
        complex_fields = {
            field.name: field.dataType
            for field in df.schema.fields
            if isinstance(field.dataType, (ArrayType, StructType, MapType))
        }

    return df





Font colors in error messages are bad in some terminals


The white font used to underline cell differences isn't great in my terminal, for example, and it would be totally unreadable in a terminal with a white background.

We should try to make two improvements:

  • Use default terminal colors for the given users (if that's possible), so it looks good in their terminal
  • Let users customize the colors

DataFramesNotEqualError when dataframes appear identical

I have two dataframes that appear identical, but assert_approx_df_equality is throwing a DataFramesNotEqualError. There may be an intermittent issue going on, because this code passed on the development cluster but failed in the test pipeline. Also, changing the precision from 0.001 to 1.0 allows the test to pass, although I don't see any differences in the actual vs. expected output.

actual_df = ...create the dataframe with my component...

expected_data = [ 
        ('POINT (2.5 1.5)', 1.0, 1.0, 0.7071067811865476, 2.0, 2.0, False),
        ('POINT (2.55 2.25)', 2.0, 2.0, 0.14142135623730964, 2.65, 2.35, False),
        ('POINT (4.75 2.5)', 3.0, 3.0, 0.5, 5.25, 2.5, False),
        ('POINT EMPTY', 4.0, None, -999.0, float('nan'), float('nan'), False)
     ]
expected_df = (spark.createDataFrame(expected_data, ["wkt", "point_id", "poly_id", "distance", "X", "Y", "isOnRight"])).sort("point_id")

actual_df.show()
expected_df.show()

assert_approx_df_equality(actual_df, expected_df, 0.001, ignore_nullable=True)

the output of the show commands:

+-----------------+--------+-------+-------------------+----+----+---------+
|              wkt|point_id|poly_id|           distance|   X|   Y|isOnRight|
+-----------------+--------+-------+-------------------+----+----+---------+
|  POINT (2.5 1.5)|     1.0|    1.0| 0.7071067811865476| 2.0| 2.0|    false|
|POINT (2.55 2.25)|     2.0|    2.0|0.14142135623730964|2.65|2.35|    false|
| POINT (4.75 2.5)|     3.0|    3.0|                0.5|5.25| 2.5|    false|
|      POINT EMPTY|     4.0|   null|             -999.0| NaN| NaN|    false|
+-----------------+--------+-------+-------------------+----+----+---------+

+-----------------+--------+-------+-------------------+----+----+---------+
|              wkt|point_id|poly_id|           distance|   X|   Y|isOnRight|
+-----------------+--------+-------+-------------------+----+----+---------+
|  POINT (2.5 1.5)|     1.0|    1.0| 0.7071067811865476| 2.0| 2.0|    false|
|POINT (2.55 2.25)|     2.0|    2.0|0.14142135623730964|2.65|2.35|    false|
| POINT (4.75 2.5)|     3.0|    3.0|                0.5|5.25| 2.5|    false|
|      POINT EMPTY|     4.0|   null|             -999.0| NaN| NaN|    false|
+-----------------+--------+-------+-------------------+----+----+---------+

The exception shows the last three rows are different though I can't spot the differences.

DataFramesNotEqualError                   Traceback (most recent call last)
<command-340851985589312> in <module>
     50 expected_df.show()
     51 
---> 52 assert_approx_df_equality(actual_df, expected_df, 0.001, ignore_nullable=True)

/databricks/python/lib/python3.7/site-packages/chispa/dataframe_comparer.py in assert_approx_df_equality(df1, df2, precision, ignore_nullable)
     38 def assert_approx_df_equality(df1, df2, precision, ignore_nullable=False):
     39     assert_schema_equality(df1.schema, df2.schema, ignore_nullable)
---> 40     assert_generic_rows_equality(df1, df2, are_rows_approx_equal, [precision])
     41 
     42 

/databricks/python/lib/python3.7/site-packages/chispa/dataframe_comparer.py in assert_generic_rows_equality(df1, df2, row_equality_fun, row_equality_fun_args)
     62             t.add_row([r1, r2])
     63     if allRowsEqual == False:
---> 64         raise DataFramesNotEqualError("\n" + t.get_string())
     65 
     66 

DataFramesNotEqualError: 
+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|                                                          df1                                                           |                                                          df2                                                           |
+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|   Row(wkt='POINT (2.5 1.5)', point_id=1.0, poly_id=2.0, distance=0.7071067811865476, X=3.0, Y=2.0, isOnRight=False)    |   Row(wkt='POINT (2.5 1.5)', point_id=1.0, poly_id=1.0, distance=0.7071067811865476, X=2.0, Y=2.0, isOnRight=False)    |
| Row(wkt='POINT (2.55 2.25)', point_id=2.0, poly_id=2.0, distance=0.14142135623730964, X=2.65, Y=2.35, isOnRight=False) | Row(wkt='POINT (2.55 2.25)', point_id=2.0, poly_id=2.0, distance=0.14142135623730964, X=2.65, Y=2.35, isOnRight=False) |
|          Row(wkt='POINT (4.75 2.5)', point_id=3.0, poly_id=3.0, distance=0.5, X=5.25, Y=2.5, isOnRight=False)          |          Row(wkt='POINT (4.75 2.5)', point_id=3.0, poly_id=3.0, distance=0.5, X=5.25, Y=2.5, isOnRight=False)          |
|           Row(wkt='POINT EMPTY', point_id=4.0, poly_id=None, distance=-999.0, X=nan, Y=nan, isOnRight=False)           |           Row(wkt='POINT EMPTY', point_id=4.0, poly_id=None, distance=-999.0, X=nan, Y=nan, isOnRight=False)           |
+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+

and the two schemas compared:

root
 |-- wkt: string (nullable = true)
 |-- point_id: double (nullable = true)
 |-- poly_id: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- isOnRight: boolean (nullable = true)

root
 |-- wkt: string (nullable = true)
 |-- point_id: double (nullable = true)
 |-- poly_id: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- isOnRight: boolean (nullable = true)

Output less code in error messages

The error messages currently output a lot of code.


Want to figure out how to output the pretty DataFrame comparison without all the chispa code.

Allow comparing DataFrames while ignoring row order

Hi there,

Thank you for the effort on this library. Perhaps it would be great to add an extra assert that ignores the row order of the DataFrames.

Some functions, for example df.dropDuplicates(), return rows in an unpredictable order. It would be great to have something that just ignores the order, or that orders all records by a given column...
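In the meantime, one workaround is to sort both DataFrames on a key column before asserting. A minimal sketch (the id column name is just an example, not something chispa requires):

from pyspark.sql import SparkSession
from chispa import assert_df_equality

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([(2, "b"), (1, "a")], ["id", "val"])
df2 = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])

# Sorting both sides on a shared key column makes the comparison order-insensitive.
assert_df_equality(df1.orderBy("id"), df2.orderBy("id"))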

Thank you for any consideration!

Good luck!

Misleading coloring after failing assert

Hi, I'm using chispa for tests. I found that it paints some columns red in the report even when they are not incorrect. In my case, the size_cd and retail_week columns are reshuffled, which causes the assertion error, but chispa also paints red all the other columns that only differ in their nullable flag, even though I'm using the ignore_nullable parameter.

I think it would be better to paint such columns blue. Right now it feels like chispa does not work as intended with the ignore_nullable parameter.


Unit tests are only run against a single version of Python on the `main` branch.

I think it could be useful to run unit tests against multiple versions of Python on each PR, to gain extra confidence in the proposed changes of a PR. This also builds extra confidence that chispa actually works on the Python versions specified in pyproject.toml, and could prevent issues such as #78 in the future.

I have created a draft PR to solve this: #81

Use eqNullSafe instead of collect

Since Spark 2.3 there is the PySpark function eqNullSafe. This seems like a much better way to compare columns, and it can also be used to compare DataFrames.

Advantages:

  • It comes from the main library, hence no need to adjust chispa if the library later decides to change the way DataFrames interact with collect
  • Solves the NaN and Null problem

For DataFrames it would mean some sort of loop over the columns and then a reduce to check that all members of the resulting column are true. I think it is worth the change for the two reasons given above.
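A rough sketch of what the column-wise eqNullSafe comparison could look like (only an illustration, not chispa's actual implementation; it assumes both DataFrames share a unique key column, called id here, to join on):

from functools import reduce

from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def dfs_equal_null_safe(df1: DataFrame, df2: DataFrame, key: str = "id") -> bool:
    if df1.columns != df2.columns:
        return False
    # Assumption: `key` is a unique join key present in both DataFrames.
    joined = df1.alias("a").join(df2.alias("b"), on=key, how="full")
    value_cols = [c for c in df1.columns if c != key]
    # eqNullSafe treats NULL <=> NULL as true, unlike plain equality.
    all_equal = reduce(
        lambda acc, c: acc & F.col(f"a.{c}").eqNullSafe(F.col(f"b.{c}")),
        value_cols,
        F.lit(True),
    )
    # The DataFrames match if no row has a mismatching column.
    return joined.select(all_equal.alias("eq")).filter(~F.col("eq")).count() == 0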

Possibly ignore row order by default for DataFrame comparisons

Maybe I'm the outlier, but I consider the more intuitive check, especially for testing purposes, to be the one that ignores order. If some function produces a DataFrame that I want to check, I care about the contents. And by default, Spark offers no guarantees on row order unless your plan has an explicit .orderBy(). So relying on the stability of row order in the absence of an explicit order by clause is a recipe for surprises, much like it is in SQL.

In fact, I don't think .collect() even provides any guarantee that the row order of the resulting array will match the row order of the original DataFrame, again, unless the DataFrame has an explicit ordering specified. It's theoretically possible, for example, that you could call spark.range(3).collect() twice and get different row orders each time. So if you're relying on .collect() to preserve order without explicit ordering on the original DataFrames, then I would say that's technically incorrect.

By the way, in your own usages of this library (or the Scala equivalent), how often do you compare DataFrames where you care about the row order? I'm curious to see a few examples of that.

Originally posted by @nchammas in #19 (comment)

Unit testing the code with Spark Connect

I created a pull request and confirmed that chispa is fully compliant with Spark Connect: #86

We don't want to make chispa depend on Spark Connect, because then it would have to depend on PySpark >= 3.5. We want chispa to support many old Spark versions for users that are still on legacy Spark runtimes.

It would be nice to test chispa + Spark Connect in CI, though. We don't ever want to add any chispa code that wouldn't work with Spark Connect. We want all Spark Connect users to always have access to chispa as well.

Any thoughts on how to add Spark Connect to the CI test suite?

Handle nested nullability

When using ignore_nullable=True, chispa still sees differences in ArrayType columns because there is a nullability difference in the inner type:

StructField(my_arr_col,ArrayType(StringType,false),false)
StructField(my_arr_col,ArrayType(StringType,true),true)
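One possible approach, sketched below rather than taken from chispa's code, is to recursively strip nullability from both schemas before comparing them:

from pyspark.sql.types import ArrayType, DataType, MapType, StructField, StructType


def strip_nullability(dt: DataType) -> DataType:
    # Normalize nullability flags at every level of nesting.
    if isinstance(dt, StructType):
        return StructType([
            StructField(f.name, strip_nullability(f.dataType), True, f.metadata)
            for f in dt.fields
        ])
    if isinstance(dt, ArrayType):
        return ArrayType(strip_nullability(dt.elementType), True)
    if isinstance(dt, MapType):
        return MapType(strip_nullability(dt.keyType), strip_nullability(dt.valueType), True)
    return dt


def schemas_equal_ignore_nullable(s1: StructType, s2: StructType) -> bool:
    return strip_nullability(s1) == strip_nullability(s2)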

Make it easier for conda devs to setup virtual env to work on this project

This project is set up for Poetry development.

Is there a way to setup this project for conda development as well? I want to keep using Poetry for my personal development and to build / release the wheel files.

Is there a way to structure the project so people with conda installed on their machines can get a virtual environment properly set up? Conda is the other popular Python virtual environment tool, correct?

Add unit tests to highlight limitations of this library

Let's add some unit tests that highlight the limitations of this library.

Situations like DataFrames with nested schemas, nested arrays, NaN values, and any other weird edge case that we need to support.

This will hopefully encourage the open source community to fill the gaps.

SchemasNotEqualError does not show extra columns in one schema

E chispa.dataframe_comparer.SchemasNotEqualError:
E +------------------------------------------+------------------------------------------+
E |                 schema1                  |                 schema2                  |
E +------------------------------------------+------------------------------------------+
E | StructField(second_name,StringType,true) | StructField(second_name,StringType,true) |
E | StructField(id,LongType,true)            | StructField(id,LongType,true)            |
E | StructField(floor,LongType,true)         | StructField(floor,LongType,true)         |
E | StructField(first_name,StringType,true)  | StructField(first_name,StringType,true)  |
E +------------------------------------------+------------------------------------------+
It didn't show that schema2 has one more column, 'age'.
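A possible fix, sketched here rather than taken from chispa's code, is to pad the shorter schema with zip_longest when building the comparison table, so fields present in only one schema still show up:

from itertools import zip_longest

from prettytable import PrettyTable


def schema_comparison_table(s1, s2) -> str:
    t = PrettyTable(["schema1", "schema2"])
    # zip_longest pads the shorter schema with None, so extra fields such as
    # the 'age' column above are no longer silently dropped from the output.
    for sf1, sf2 in zip_longest(s1, s2):
        t.add_row([sf1, sf2])
    return t.get_string()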

Add support for NaN equality within Arrays

When trying to assert_df_equality with allow_nan_equality=True, if both DataFrames hold an array that contains some NaN values then the comparer fails, even if the NaNs are in the same place.

An additional check should take place here to compare the array elements.

if allow_nan_equality:
    for key in d1.keys() & d2.keys():
        if not nan_safe_equality(d1[key], d2[key]):
            return False
    return True

Again, I can help contribute to this one but it won't be until next week.
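A sketch of what an extended element-wise check could look like; the helper name mirrors the nan_safe_equality used above, but this version also recurses into lists and is only an illustration:

import math


def nan_safe_equality(x, y) -> bool:
    # Recurse into lists so arrays with NaN in the same positions compare equal.
    if isinstance(x, list) and isinstance(y, list):
        return len(x) == len(y) and all(nan_safe_equality(a, b) for a, b in zip(x, y))
    # Treat NaN == NaN as equal for plain floats.
    if isinstance(x, float) and isinstance(y, float):
        return x == y or (math.isnan(x) and math.isnan(y))
    return x == y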


Highlight mismatching values

assert_df_equality is very handy for catching regression in pyspark ETL functions.

But debugging is hard when there are many columns (e.g. more than 5-10 with long names). The pretty-table row comparison output spreads records over multiple lines, and it becomes difficult to manually identify the discrepancies.

I'd like to request a feature which could make this easier: optional highlighting of diffs. When two rows don't match and are printed in red text, the mismatching values could also be highlighted, e.g. with a yellow background color.

Possible Solution

To highlight those values, rather than or in addition to testing for equality, I think we'd need to identify and return them for PrettyTable display to handle differently. E.g. collect column keys with mismatching values here and here, then search for their corresponding tokens and color them differently in the strings here and here.

Maybe try colorama for background highlighting.

Adapt "ignore_nullable" parameter to handle data types within Arrays

If the schema contains arrays, and the arrays contain types with different nullability, the "ignore_nullable" option doesn't work.

Consider changing to the following:

def are_schemas_equal_ignore_nullable(s1, s2):
    if len(s1) != len(s2):
        return False
    zipped = list(six.moves.zip_longest(s1, s2))
    for sf1, sf2 in zipped:
        if sf1.name != sf2.name or not check_type_equal_ignore_nullable(
            sf1.dataType, sf2.dataType
        ):
            return False
    return True


def check_type_equal_ignore_nullable(t1, t2):
    """Checks column data types, ignoring nullability."""
    if t1.typeName() == t2.typeName():
        # Account for array types by inspecting elementType.
        if t1.typeName() == "array":
            return t1.elementType == t2.elementType
        return True
    return False

I can have a go at contributing if you're OK with it?

Mutable default arguments in Python (like `[]`) are slightly dangerous. If you mutate `transforms` inside this function, the mutations will persist to the next call of the function.


The typical solution to this is to set the default to None and then inside the function convert to [].

Originally posted by @nchammas in #16 (comment)
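A generic illustration of the pitfall and of the None-default fix (not chispa's actual signature):

def append_item_bad(item, items=[]):
    # The same list object is reused across calls, so items silently accumulate.
    items.append(item)
    return items


def append_item_good(item, items=None):
    # Defaulting to None and building the list inside avoids the shared state.
    if items is None:
        items = []
    items.append(item)
    return items


append_item_bad("a")   # ['a']
append_item_bad("b")   # ['a', 'b']  <- the previous call leaked in
append_item_good("a")  # ['a']
append_item_good("b")  # ['b']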

Investigate "SPARK_TESTING" environment variable

When this variable is set, a lot of Spark-internal things are disabled or simplified. It may significantly increase the speed of testing. One can see where this variable is checked in the Spark code. We could set it automatically before creating a Spark session.
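If it turns out to help, a sketch of how it could be set before the session is created (the actual speedup still needs to be measured):

import os

from pyspark.sql import SparkSession

# SPARK_TESTING must be set before the JVM / SparkSession is created.
os.environ["SPARK_TESTING"] = "1"

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("chispa-tests")
    .getOrCreate()
)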

chispa 1.0 release

It would be nice to develop chispa so we can make a 1.0 release.

We might even want to expose a different interface. Something like this:

from dataclasses import dataclass, field


@dataclass
class MyFormats:
    mismatched_rows: list = field(default_factory=lambda: ["light_yellow"])
    matched_rows: list = field(default_factory=lambda: ["cyan", "bold"])
    mismatched_cells: list = field(default_factory=lambda: ["purple"])
    matched_cells: list = field(default_factory=lambda: ["blue"])


my_chispa = Chispa(formats=MyFormats())

my_chispa.assert_df_equality(actual_df, expected_df)

The user could inject the my_chispa object in their tests as follows:

@pytest.fixture()
def my_chispa():
    return Chispa(formats=MyFormats())

def test_shows_assert_basic_rows_equality(my_chispa):
    ...
    my_chispa.assert_basic_rows_equality(df1.collect(), df2.collect())

It's worth contemplating at least.

assert_df_equality throws SchemasNotEqualError when the dataframes are identical (except for the metadata)

I have a test where I define how the production table will be created. I'm setting comments on the columns so the user who consumes this table can understand what each column means. The problem is that when I compare that table against a custom DataFrame in a test, chispa throws an exception due to a schema mismatch.

Example:

spark.sql("""
CREATE TABLE IF NOT EXISTS foo (
    id LONG COMMENT "a comment",
    value INT
)
""")
spark.sql("INSERT INTO foo values (1,1)")

df = spark.table("foo")
schema = T.StructType([
    T.StructField("id", T.LongType(), True),
    T.StructField("value", T.IntegerType(), True),
])
expected = spark.createDataFrame(data=[(1, 1)], schema=schema)

assert_df_equality(df, expected)

The assertion fails on the schema: the output shows that value is identical (because it has no metadata) but that id is not equal (even though it looks identical). If you remove the COMMENT clause from the table creation, the test passes. Being forced to add the metadata to the StructType is much more tedious; is there a chance to ignore the metadata via a boolean flag (ignore_schema_metadata)?
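Until such a flag exists, one workaround (a sketch only, reusing the spark, df and expected variables from the example above) is to rebuild both DataFrames with metadata-free schemas before asserting:

from pyspark.sql.types import StructField, StructType

from chispa import assert_df_equality


def drop_metadata(schema: StructType) -> StructType:
    # Rebuild the schema with empty metadata on every top-level field.
    return StructType([
        StructField(f.name, f.dataType, f.nullable, metadata={})
        for f in schema.fields
    ])


df_clean = spark.createDataFrame(df.rdd, drop_metadata(df.schema))
expected_clean = spark.createDataFrame(expected.rdd, drop_metadata(expected.schema))
assert_df_equality(df_clean, expected_clean)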

underline_cells failing if dataframes are different lengths

When using the underline_cells flag in assert_df_equality, if the DataFrames have different numbers of rows, the assertion function throws an exception.

from decimal import Decimal

from pyspark.sql import SparkSession
from pyspark.sql import types as T

from chispa.dataframe_comparer import assert_df_equality

spark = SparkSession.builder.getOrCreate()

schema = T.StructType(
    [
        T.StructField("id", T.StringType(), nullable=False),
        T.StructField("balance", T.DecimalType(38,6), nullable=True),
    ]
)

df1 = spark.createDataFrame(
    [
        [1, None],
        [2, Decimal(1.0)],
    ],
    schema=schema,
)
df2 = spark.createDataFrame(
    [
        [1, None],
        [2, Decimal(1.0)],
        [3, Decimal(100)],
    ],
    schema=schema,
)

This gives two dataframes, with different row counts:

df1.show()
+---+--------+
| id| balance|
+---+--------+
|  1|    null|
|  2|1.000000|
+---+--------+

df2.show()
+---+----------+
| id|   balance|
+---+----------+
|  1|      null|
|  2|  1.000000|
|  3|100.000000|
+---+----------+

When calling just assert_df_equality you get the expected comparison:

assert_df_equality(df1, df2)
---------------------------------------------------------------------------
DataFramesNotEqualError                   Traceback (most recent call last)
Cell In [16], line 1
----> 1 assert_df_equality(df1, df2)

File /opt/conda/lib/python3.9/site-packages/chispa/dataframe_comparer.py:27, in assert_df_equality(df1, df2, ignore_nullable, transforms, allow_nan_equality, ignore_column_order, ignore_row_order, underline_cells, ignore_metadata)
     24     assert_generic_rows_equality(
     25         df1.collect(), df2.collect(), are_rows_equal_enhanced, [True], underline_cells=underline_cells)
     26 else:
---> 27     assert_basic_rows_equality(
     28         df1.collect(), df2.collect(), underline_cells=underline_cells)

File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:25, in assert_basic_rows_equality(rows1, rows2, underline_cells)
     23         else:
     24             t.add_row([r1, r2])
---> 25 raise chispa.DataFramesNotEqualError("\n" + t.get_string())

DataFramesNotEqualError: 
+------------------------------------------+--------------------------------------------+
|                   df1                    |                    df2                     |
+------------------------------------------+--------------------------------------------+
|        Row(id='1', balance=None)         |         Row(id='1', balance=None)          |
| Row(id='2', balance=Decimal('1.000000')) |  Row(id='2', balance=Decimal('1.000000'))  |
|                   None                   | Row(id='3', balance=Decimal('100.000000')) |
+------------------------------------------+--------------------------------------------+

but when adding underline_cells you get an exception:

assert_df_equality(df1, df2, underline_cells=True)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In [17], line 1
----> 1 assert_df_equality(df1, df2, underline_cells=True)

File /opt/conda/lib/python3.9/site-packages/chispa/dataframe_comparer.py:27, in assert_df_equality(df1, df2, ignore_nullable, transforms, allow_nan_equality, ignore_column_order, ignore_row_order, underline_cells, ignore_metadata)
     24     assert_generic_rows_equality(
     25         df1.collect(), df2.collect(), are_rows_equal_enhanced, [True], underline_cells=underline_cells)
     26 else:
---> 27     assert_basic_rows_equality(
     28         df1.collect(), df2.collect(), underline_cells=underline_cells)

File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:21, in assert_basic_rows_equality(rows1, rows2, underline_cells)
     19 else:
     20     if underline_cells:
---> 21         t.add_row(__underline_cells_in_row(
     22             r1=r1, r2=r2, row_column_names=row_column_names, num_columns=num_columns))
     23     else:
     24         t.add_row([r1, r2])

File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:73, in __underline_cells_in_row(r1, r2, row_column_names, num_columns)
     70 else:
     71     append_str = ", "
---> 73 if r1[column] != r2[column]:
     74     r1_string += underline_text(
     75         f"{column}='{r1[column]}'") + f"{append_str}"
     76     r2_string += underline_text(
     77         f"{column}='{r2[column]}'") + f"{append_str}"

TypeError: 'NoneType' object is not subscriptable
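The crash appears to come from indexing a row that is None after the shorter DataFrame was padded; a guard along these lines (purely illustrative, not the actual chispa code) would avoid it:

def cells_differ(r1, r2, column) -> bool:
    # zip_longest pads the shorter DataFrame with None, so guard against
    # indexing a missing row before comparing the individual cells.
    if r1 is None or r2 is None:
        return True
    return r1[column] != r2[column]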

The it_does_not_throw_with_different_schema test exposes a bug

This test shouldn't be passing:

    def it_does_not_throw_with_different_schema():
        data1 = [(1.0, "jose"), (1.1, "li"), (1.2, "laura"), (None, None)]
        df1 = spark.createDataFrame(data1, ["num", "expected_name"])
        data2 = [("li", 1.05), ("laura", 1.2), (None, None), ("jose", 1.0)]
        df2 = spark.createDataFrame(data2, ["another_name", "same_num"])
        assert_approx_df_equality(df1, df2, 0.1, ignore_schema=True)

ignore_row_order isn't set (so it defaults to False), so this shouldn't be passing.

This is because d1.keys() & d2.keys() returns an empty set when the column names are different. The conditions are never actually checked, and the function just returns True.
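In other words, with disjoint column names the intersection of keys is empty, the loop body never runs, and the check passes vacuously:

d1 = {"num": 1.0, "expected_name": "jose"}
d2 = {"another_name": "jose", "same_num": 1.05}

d1.keys() & d2.keys()  # set() -- empty, so there is nothing to compare

for key in d1.keys() & d2.keys():
    ...  # never executes, so no difference can ever be detected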

assert None while ignore_metadata=True

Hello, I have an issue with the usage of ignore_metadata=True while comparing two schemas. Both of them have metadata, defined either explicitly or as an empty {}. When ignore_metadata=False, it seems that the mismatches based on metadata are found. But when I set ignore_metadata=True, the error I get is:

AssertionError: assert None where None = assert_schema_equality(StructType([StructField('...))], StructType([StructField(...)], ignore_nullable=True, ignore_metadata=True).

Can you help me on that one? Thanks in advance!

Refactor code to conform to PEP8

Non-essential, would help to improve readability of code by ensuring the style conforms to PEP8 standards, including:

  • Indentation
  • Max line length
  • Naming conventions
  • Comments

pkg_resources is deprecated, prettytable produces warning

Using assert_df_equality produces a warning in the console about pkg_resources being deprecated.

======================================================================================= warnings summary ========================================================================================
.venv/lib/python3.9/site-packages/pkg_resources/__init__.py:121
  /Users/ethnhll/Projects/dsc-publish/.venv/lib/python3.9/site-packages/pkg_resources/__init__.py:121: DeprecationWarning: pkg_resources is deprecated as an API
    warnings.warn("pkg_resources is deprecated as an API", DeprecationWarning)

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
================================================================================= 1 passed, 1 warning in 11.76s =================================================================================

Looking at the code where pkg_resources is referenced, it seems like it is not currently in use.

import pkg_resources

I'm not familiar enough with the build processes at play for this project so I can't say for sure that removing the dependency on pkg_resources in prettytable.py won't have unintended side effects, but I can open a PR that removes it.

assert_df_equality fails when comparing nan values

Hello,

First of all, thank you for this great package! It's really making my test code cleaner.

One issue I've encountered is that when I perform assert_df_equality() on two dataframes where each contains a row with NaN values, the test fails.
image

I can see why it would fail, as float("nan") == float("nan") evaluates to False. However, if I convert the DataFrames to pandas DataFrames first and then use pd.testing.assert_frame_equal, the test passes. It would be awesome if assert_df_equality could handle NaN cases as well.
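For what it's worth, the assert_df_equality signature quoted in another issue on this page includes an allow_nan_equality flag; a minimal example of using it (a sketch, with made-up data):

from pyspark.sql import SparkSession
from chispa import assert_df_equality

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([(1, float("nan")), (2, 2.0)], ["id", "score"])
df2 = spark.createDataFrame([(1, float("nan")), (2, 2.0)], ["id", "score"])

# With the flag enabled, NaN values in the same position are treated as equal.
assert_df_equality(df1, df2, allow_nan_equality=True)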

Thanks!

Give user control to customize output formatting

As noted in this pull request (#68), we want to give the user the ability to control the formatting of the output.

The formatting should be easy to configure for a given test and also easy to set globally for the entire test suite.

Here are the main concepts we want to model:

  • formatting for matched rows, unmatched rows, matched cells, unmatched cells
  • reprinting the DataFrame columns that don't match (for wide DataFrame comparisons). See this PR: #48.
  • displaying the diff DataFrame, see this PR: #35

The formatting should let the user configure color, underline, and bold.

These settings should be globally applicable to all the interfaces in the project including schema comparisons, DataFrame comparisons, StructField comparisons, and column comparisons.

Something like this could work:

{
  "mismatched_rows": ["red", "bold"],
  "matched_rows": "blue",
  "mismatched_cells": ["white", "underline"],
  "print_dif": True,
  "print_mismatched_cols": True
}

The user should be able to set this globally and then override for a given test (they should be able to partially override).

The user should also be able to ignore this entirely and just rely on the built-in defaults.
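A sketch of how the partial override could behave (purely illustrative; the keys mirror the proposal above):

GLOBAL_FORMATS = {
    "mismatched_rows": ["red", "bold"],
    "matched_rows": "blue",
    "mismatched_cells": ["white", "underline"],
    "print_diff": True,
    "print_mismatched_cols": True,
}


def resolve_formats(overrides=None):
    # A per-test override only mentions the keys it changes; everything else
    # falls back to the global settings.
    return {**GLOBAL_FORMATS, **(overrides or {})}


resolve_formats({"mismatched_rows": ["purple"]})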

Hopefully we can make the outputs look good on both Mac and Windows machines.

Python 2.7 Compatibility

Dear team,

I am trying to integrate this library into my build process for unit tests and got the error below during the build. We are on a Python 2.7 environment. Can you please help here?

Traceback (most recent call last):
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/nose/loader.py", line 418, in loadTestsFromName
    addr.filename, addr.module)
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/nose/importer.py", line 47, in importFromPath
    return self.importFromDir(dir_path, fqname)
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/nose/importer.py", line 94, in importFromDir
    mod = load_module(part_fqname, fh, filename, desc)
  File "/var/lib/jenkins/workspace/myproject/test/test_functions.py", line 3, in <module>
    from chispa import *
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/chispa/__init__.py", line 1, in <module>
    from .dataframe_comparer import DataFramesNotEqualError, assert_df_equality, assert_approx_df_equality
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/chispa/dataframe_comparer.py", line 2, in <module>
    from chispa.bcolors import *
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/chispa/bcolors.py", line 28
    def blue(s: str) -> str:
