
chispa's People

Contributors

alexmooney, alexott, anaynayak, antondemeester, arunvelsriram, calum-mcg, chris-remedy, dependabot[bot], ethnhll, lucas-lm, mitches-got-glitches, mrpowers, orcascope, ratsclub, semyonsinchenko

chispa's Issues

Flatten dataframe

Hi, can we add a function to flatten a nested dataframe?

from pyspark.sql.functions import col, explode_outer, map_keys
from pyspark.sql.types import ArrayType, MapType, StructType


def flatten_test(df, sep="_"):
    """Returns a flattened dataframe.
    .. versionadded:: x.X.X

    Parameters
    ----------
    sep : str
        Delimiter for flattened columns. Default `_`

    Notes
    -----
    Don't use `.` as `sep`: it won't work on nested data frames with more than
    one level, and you will have to use `columns.name`.

    Flattening MapType columns requires finding every distinct key in the column,
    which can be slow.

    Examples
    --------

    data_mixed = [
        {
            "state": "Florida",
            "shortname": "FL",
            "info": {"governor": "Rick Scott"},
            "counties": [
                {"name": "Dade", "population": 12345},
                {"name": "Broward", "population": 40000},
                {"name": "Palm Beach", "population": 60000},
            ],
        },
        {
            "state": "Ohio",
            "shortname": "OH",
            "info": {"governor": "John Kasich"},
            "counties": [
                {"name": "Summit", "population": 1234},
                {"name": "Cuyahoga", "population": 1337},
            ],
        },
    ]

    data_mixed = spark.createDataFrame(data=data_mixed)

    data_mixed.printSchema()

    root
    |-- counties: array (nullable = true)
    |    |-- element: map (containsNull = true)
    |    |    |-- key: string
    |    |    |-- value: string (valueContainsNull = true)
    |-- info: map (nullable = true)
    |    |-- key: string
    |    |-- value: string (valueContainsNull = true)
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)


    data_mixed_flat = flatten_test(data_mixed, sep=":")
    data_mixed_flat.printSchema()
    root
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- counties:name: string (nullable = true)
    |-- counties:population: string (nullable = true)
    |-- info:governor: string (nullable = true)




    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]


    df = spark.createDataFrame(data=data)

    df.printSchema()

    root
    |-- fitness: map (nullable = true)
    |    |-- key: string
    |    |-- value: long (valueContainsNull = true)
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)

    df_flat = flatten_test(df, sep=":")

    df_flat.printSchema()

    root
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)
    |-- fitness:height: long (nullable = true)
    |-- fitness:weight: long (nullable = true)

    data_struct = [
            (("James",None,"Smith"),"OH","M"),
            (("Anna","Rose",""),"NY","F"),
            (("Julia","","Williams"),"OH","F"),
            (("Maria","Anne","Jones"),"NY","M"),
            (("Jen","Mary","Brown"),"NY","M"),
            (("Mike","Mary","Williams"),"OH","M")
            ]


    schema = StructType([
        StructField('name', StructType([
            StructField('firstname', StringType(), True),
            StructField('middlename', StringType(), True),
            StructField('lastname', StringType(), True)
            ])),
        StructField('state', StringType(), True),
        StructField('gender', StringType(), True)
        ])

    df_struct = spark.createDataFrame(data = data_struct, schema = schema)

    df_struct.printSchema()

    root
    |-- name: struct (nullable = true)
    |    |-- firstname: string (nullable = true)
    |    |-- middlename: string (nullable = true)
    |    |-- lastname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)

    df_struct_flat = flatten_test(df_struct, sep=":")

    df_struct_flat.printSchema()

    root
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)
    |-- name:firstname: string (nullable = true)
    |-- name:middlename: string (nullable = true)
    |-- name:lastname: string (nullable = true)
    """
    # Collect the complex fields (ArrayType, StructType and MapType) in the schema.
    complex_fields = {
        field.name: field.dataType
        for field in df.schema.fields
        if isinstance(field.dataType, (ArrayType, StructType, MapType))
    }

    while len(complex_fields) != 0:
        col_name = list(complex_fields.keys())[0]
        # print ("Processing :"+col_name+" Type : "+str(type(complex_fields[col_name])))

        # if StructType then convert all sub element to columns.
        # i.e. flatten structs
        if isinstance(complex_fields[col_name], StructType):
            expanded = [
                col(col_name + "." + k).alias(col_name + sep + k)
                for k in [n.name for n in complex_fields[col_name]]
            ]
            df = df.select("*", *expanded).drop(col_name)

        # if ArrayType then add the Array Elements as Rows using the explode function
        # i.e. explode Arrays
        elif isinstance(complex_fields[col_name], ArrayType):
            df = df.withColumn(col_name, explode_outer(col_name))

        # if MapType then convert all sub element to columns.
        # i.e. flatten
        elif isinstance(complex_fields[col_name], MapType):
            keys_df = df.select(explode_outer(map_keys(col(col_name)))).distinct()
            keys = list(map(lambda row: row[0], keys_df.collect()))
            key_cols = list(
                map(
                    lambda f: col(col_name).getItem(f).alias(str(col_name + sep + f)),
                    keys,
                )
            )
            drop_column_list = [col_name]
            df = df.select(
                [
                    col_name
                    for col_name in df.columns
                    if col_name not in drop_column_list
                ]
                + key_cols
            )

        # Recompute the remaining complex fields in the schema.
        complex_fields = {
            field.name: field.dataType
            for field in df.schema.fields
            if isinstance(field.dataType, (ArrayType, StructType, MapType))
        }

    return df





Font colors in error messages are bad in some terminals


The white font used to underline cell differences isn't great in my terminal, for example, and it would be totally unreadable in a terminal with a white background.

We should try to make two improvements:

  • Use default terminal colors for the given users (if that's possible), so it looks good in their terminal
  • Let users customize the colors

DataFramesNotEqualError when dataframes appear identical

I have two dataframes that appear identical, but assert_approx_df_equality is throwing a DataFramesNotEqualError. There may be an intermittent issue going on, because this code passed on the development cluster but failed in the test pipeline. Also, changing the precision from 0.001 to 1.0 allows the test to pass, although I don't see any differences in the actual vs. expected output.

actual_df = ...create the dataframe with my component...

expected_data = [ 
        ('POINT (2.5 1.5)', 1.0, 1.0, 0.7071067811865476, 2.0, 2.0, False),
        ('POINT (2.55 2.25)', 2.0, 2.0, 0.14142135623730964, 2.65, 2.35, False),
        ('POINT (4.75 2.5)', 3.0, 3.0, 0.5, 5.25, 2.5, False),
        ('POINT EMPTY', 4.0, None, -999.0, float('nan'), float('nan'), False)
     ]
expected_df = (spark.createDataFrame(expected_data, ["wkt", "point_id", "poly_id", "distance", "X", "Y", "isOnRight"])).sort("point_id")

actual_df.show()
expected_df.show()

assert_approx_df_equality(actual_df, expected_df, 0.001, ignore_nullable=True)

the output of the show commands:

+-----------------+--------+-------+-------------------+----+----+---------+
|              wkt|point_id|poly_id|           distance|   X|   Y|isOnRight|
+-----------------+--------+-------+-------------------+----+----+---------+
|  POINT (2.5 1.5)|     1.0|    1.0| 0.7071067811865476| 2.0| 2.0|    false|
|POINT (2.55 2.25)|     2.0|    2.0|0.14142135623730964|2.65|2.35|    false|
| POINT (4.75 2.5)|     3.0|    3.0|                0.5|5.25| 2.5|    false|
|      POINT EMPTY|     4.0|   null|             -999.0| NaN| NaN|    false|
+-----------------+--------+-------+-------------------+----+----+---------+

+-----------------+--------+-------+-------------------+----+----+---------+
|              wkt|point_id|poly_id|           distance|   X|   Y|isOnRight|
+-----------------+--------+-------+-------------------+----+----+---------+
|  POINT (2.5 1.5)|     1.0|    1.0| 0.7071067811865476| 2.0| 2.0|    false|
|POINT (2.55 2.25)|     2.0|    2.0|0.14142135623730964|2.65|2.35|    false|
| POINT (4.75 2.5)|     3.0|    3.0|                0.5|5.25| 2.5|    false|
|      POINT EMPTY|     4.0|   null|             -999.0| NaN| NaN|    false|
+-----------------+--------+-------+-------------------+----+----+---------+

The exception shows the last three rows are different though I can't spot the differences.

DataFramesNotEqualError                   Traceback (most recent call last)
<command-340851985589312> in <module>
     50 expected_df.show()
     51 
---> 52 assert_approx_df_equality(actual_df, expected_df, 0.001, ignore_nullable=True)

/databricks/python/lib/python3.7/site-packages/chispa/dataframe_comparer.py in assert_approx_df_equality(df1, df2, precision, ignore_nullable)
     38 def assert_approx_df_equality(df1, df2, precision, ignore_nullable=False):
     39     assert_schema_equality(df1.schema, df2.schema, ignore_nullable)
---> 40     assert_generic_rows_equality(df1, df2, are_rows_approx_equal, [precision])
     41 
     42 

/databricks/python/lib/python3.7/site-packages/chispa/dataframe_comparer.py in assert_generic_rows_equality(df1, df2, row_equality_fun, row_equality_fun_args)
     62             t.add_row([r1, r2])
     63     if allRowsEqual == False:
---> 64         raise DataFramesNotEqualError("\n" + t.get_string())
     65 
     66 

DataFramesNotEqualError: 
+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|                                                          df1                                                           |                                                          df2                                                           |
+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
|   Row(wkt='POINT (2.5 1.5)', point_id=1.0, poly_id=2.0, distance=0.7071067811865476, X=3.0, Y=2.0, isOnRight=False)    |   Row(wkt='POINT (2.5 1.5)', point_id=1.0, poly_id=1.0, distance=0.7071067811865476, X=2.0, Y=2.0, isOnRight=False)    |
| Row(wkt='POINT (2.55 2.25)', point_id=2.0, poly_id=2.0, distance=0.14142135623730964, X=2.65, Y=2.35, isOnRight=False) | Row(wkt='POINT (2.55 2.25)', point_id=2.0, poly_id=2.0, distance=0.14142135623730964, X=2.65, Y=2.35, isOnRight=False) |
|          Row(wkt='POINT (4.75 2.5)', point_id=3.0, poly_id=3.0, distance=0.5, X=5.25, Y=2.5, isOnRight=False)          |          Row(wkt='POINT (4.75 2.5)', point_id=3.0, poly_id=3.0, distance=0.5, X=5.25, Y=2.5, isOnRight=False)          |
|           Row(wkt='POINT EMPTY', point_id=4.0, poly_id=None, distance=-999.0, X=nan, Y=nan, isOnRight=False)           |           Row(wkt='POINT EMPTY', point_id=4.0, poly_id=None, distance=-999.0, X=nan, Y=nan, isOnRight=False)           |
+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+

and the two schemas compared:

root
 |-- wkt: string (nullable = true)
 |-- point_id: double (nullable = true)
 |-- poly_id: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- isOnRight: boolean (nullable = true)

root
 |-- wkt: string (nullable = true)
 |-- point_id: double (nullable = true)
 |-- poly_id: double (nullable = true)
 |-- distance: double (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)
 |-- isOnRight: boolean (nullable = true)

Output less code in error messages

The error messages currently output a lot of code.


Want to figure out how to output the pretty DataFrame comparison without all the chispa code.

Allow comparing DataFrames while ignoring row order

Hi there,

Thank you for the effort on this library. Perhaps it would be great to add an extra assert that ignores the row order of the DataFrames.

Some functions, for example df.dropDuplicates(), return rows in an unpredictable order. It would be great to have something that just ignores the order, or that orders all records by a given column...
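In the meantime, one workaround is to sort both DataFrames on a key column before asserting. A minimal sketch (the id column name is just an example, not something chispa requires):

from pyspark.sql import SparkSession
from chispa import assert_df_equality

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([(2, "b"), (1, "a")], ["id", "val"])
df2 = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])

# Sorting both sides on a shared key column makes the comparison order-insensitive.
assert_df_equality(df1.orderBy("id"), df2.orderBy("id"))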

Thank you for any consideration!

Good luck!

Misleading coloring after failing assert

Hi, I'm using chispa for tests. I found that it paints some columns red in the report even when they are not incorrect. In my case, the size_cd and retail_week columns are reshuffled, which causes the assertion error, but chispa also paints red all the other columns that only differ in their nullable flag, even though I'm using the ignore_nullable parameter.

I think it would be better to paint such columns blue. Right now it feels like chispa does not work as intended with the ignore_nullable parameter.


Unit tests are only run against a single version of Python on the `main` branch.

I think it could be useful to run unit tests against multiple versions of Python on each PR, to gain extra confidence in the proposed changes of a PR. This also builds extra confidence that chispa actually works on the Python versions specified in pyproject.toml, and could prevent issues such as #78 in the future.

I have created a draft PR to solve this: #81

Use eqNullSafe instead of collect

Since Spark 2.3 there is the PySpark function eqNullSafe. This seems like a much better way to compare columns, and it can also be used to compare DataFrames.

Advantages:

  • It comes from the main library, hence no need to adjust chispa if the library later decides to change the way DataFrames interact with collect
  • Solves the NaN and Null problem

For DataFrames it would mean some sort of loop over the columns and then a reduce to check that all members of the resulting column are true. I think it is worth the change for the two reasons given above.
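A rough sketch of what the column-wise eqNullSafe comparison could look like (only an illustration, not chispa's actual implementation; it assumes both DataFrames share a unique key column, called id here, to join on):

from functools import reduce

from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def dfs_equal_null_safe(df1: DataFrame, df2: DataFrame, key: str = "id") -> bool:
    if df1.columns != df2.columns:
        return False
    # Assumption: `key` is a unique join key present in both DataFrames.
    joined = df1.alias("a").join(df2.alias("b"), on=key, how="full")
    value_cols = [c for c in df1.columns if c != key]
    # eqNullSafe treats NULL <=> NULL as true, unlike plain equality.
    all_equal = reduce(
        lambda acc, c: acc & F.col(f"a.{c}").eqNullSafe(F.col(f"b.{c}")),
        value_cols,
        F.lit(True),
    )
    # The DataFrames match if no row has a mismatching column.
    return joined.select(all_equal.alias("eq")).filter(~F.col("eq")).count() == 0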

Possibly ignore row order by default for DataFrame comparisons

Maybe I'm the outlier, but I consider the more intuitive check, especially for testing purposes, to be the one that ignores order. If some function produces a DataFrame that I want to check, I care about the contents. And by default, Spark offers no guarantees on row order unless your plan has an explicit .orderBy(). So relying on the stability of row order in the absence of an explicit order by clause is a recipe for surprises, much like it is in SQL.

In fact, I don't think .collect() even provides any guarantee that the row order of the resulting array will match the row order of the original DataFrame, again, unless the DataFrame has an explicit ordering specified. It's theoretically possible, for example, that you could call spark.range(3).collect() twice and get different row orders each time. So if you're relying on .collect() to preserve order without explicit ordering on the original DataFrames, then I would say that's technically incorrect.

By the way, in your own usages of this library (or the Scala equivalent), how often do you compare DataFrames where you care about the row order? I'm curious to see a few examples of that.

Originally posted by @nchammas in #19 (comment)

Unit testing the code with Spark Connect

I created a pull request and confirmed that chispa is fully compliant with Spark Connect: #86

We don't want to make chispa depend on Spark Connect, because then it would have to depend on PySpark >= 3.5. We want chispa to support many old Spark versions for users that are still on legacy Spark runtimes.

It would be nice to test chispa + Spark Connect in CI, though. We don't ever want to add any chispa code that wouldn't work with Spark Connect. We want all Spark Connect users to always have access to chispa as well.

Any thoughts on how to add Spark Connect to the CI test suite?

Handle nested nullability

When using ignore_nullable=True, chispa still sees differences in ArrayType columns because there is a nullability difference in the inner type:

StructField(my_arr_col,ArrayType(StringType,false),false)
StructField(my_arr_col,ArrayType(StringType,true),true)
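One possible approach, sketched below rather than taken from chispa's code, is to recursively strip nullability from both schemas before comparing them:

from pyspark.sql.types import ArrayType, DataType, MapType, StructField, StructType


def strip_nullability(dt: DataType) -> DataType:
    # Normalize nullability flags at every level of nesting.
    if isinstance(dt, StructType):
        return StructType([
            StructField(f.name, strip_nullability(f.dataType), True, f.metadata)
            for f in dt.fields
        ])
    if isinstance(dt, ArrayType):
        return ArrayType(strip_nullability(dt.elementType), True)
    if isinstance(dt, MapType):
        return MapType(strip_nullability(dt.keyType), strip_nullability(dt.valueType), True)
    return dt


def schemas_equal_ignore_nullable(s1: StructType, s2: StructType) -> bool:
    return strip_nullability(s1) == strip_nullability(s2)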

Make it easier for conda devs to setup virtual env to work on this project

This project is set up for Poetry development.

Is there a way to setup this project for conda development as well? I want to keep using Poetry for my personal development and to build / release the wheel files.

Is there a way to structure the project so people with conda installed on their machines can get a virtual environment properly set up? Conda is the other popular Python virtual environment tool, correct?

Add unit tests to highlight limitations of this library

Let's add some unit tests that highlight the limitations of this library.

Situations like DataFrames with nested schemas, nested arrays, NaN values, and any other weird edge case that we need to support.

This will hopefully encourage the open source community to fill the gaps.

SchemasNotEqualError does not show extra columns in one schema

E chispa.dataframe_comparer.SchemasNotEqualError:
E +------------------------------------------+------------------------------------------+
E |                 schema1                  |                 schema2                  |
E +------------------------------------------+------------------------------------------+
E | StructField(second_name,StringType,true) | StructField(second_name,StringType,true) |
E | StructField(id,LongType,true)            | StructField(id,LongType,true)            |
E | StructField(floor,LongType,true)         | StructField(floor,LongType,true)         |
E | StructField(first_name,StringType,true)  | StructField(first_name,StringType,true)  |
E +------------------------------------------+------------------------------------------+
It didn't show that schema2 has one more column, 'age'.
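A possible fix, sketched here rather than taken from chispa's code, is to pad the shorter schema with zip_longest when building the comparison table, so fields present in only one schema still show up:

from itertools import zip_longest

from prettytable import PrettyTable


def schema_comparison_table(s1, s2) -> str:
    t = PrettyTable(["schema1", "schema2"])
    # zip_longest pads the shorter schema with None, so extra fields such as
    # the 'age' column above are no longer silently dropped from the output.
    for sf1, sf2 in zip_longest(s1, s2):
        t.add_row([sf1, sf2])
    return t.get_string()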

Add support for NaN equality within Arrays

When trying to assert_df_equality with allow_nan_equality=True, if both DataFrames hold an array that contains some NaN values then the comparer fails, even if the NaNs are in the same place.

An additional check should take place here to compare the array elements.

if allow_nan_equality:
    for key in d1.keys() & d2.keys():
        if not nan_safe_equality(d1[key], d2[key]):
            return False
    return True

Again, I can help contribute to this one but it won't be until next week.
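A sketch of what an extended element-wise check could look like; the helper name mirrors the nan_safe_equality used above, but this version also recurses into lists and is only an illustration:

import math


def nan_safe_equality(x, y) -> bool:
    # Recurse into lists so arrays with NaN in the same positions compare equal.
    if isinstance(x, list) and isinstance(y, list):
        return len(x) == len(y) and all(nan_safe_equality(a, b) for a, b in zip(x, y))
    # Treat NaN == NaN as equal for plain floats.
    if isinstance(x, float) and isinstance(y, float):
        return x == y or (math.isnan(x) and math.isnan(y))
    return x == y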


Highlight mismatching values

assert_df_equality is very handy for catching regression in pyspark ETL functions.

But debugging is hard when there are many columns (e.g. more than 5-10 with long names). The pretty-table row comparison output spreads records over multiple lines, and it becomes difficult to manually identify the discrepancies.

I'd like to request a feature which could make this easier: optional highlighting of diffs. When two rows don't match and are printed in red text, the mismatching values could also be highlighted, e.g. with a yellow background color.

Possible Solution

To highlight those values, rather than or in addition to testing for equality, I think we'd need to identify and return them for PrettyTable display to handle differently. E.g. collect column keys with mismatching values here and here, then search for their corresponding tokens and color them differently in the strings here and here.

Maybe try colorama for background highlighting.

Adapt "ignore_nullable" parameter to handle data types within Arrays

If the schema contains arrays, and the arrays contain types with different nullability, the "ignore_nullable" option doesn't work.

Consider changing to the following:

def are_schemas_equal_ignore_nullable(s1, s2):
    if len(s1) != len(s2):
        return False
    zipped = list(six.moves.zip_longest(s1, s2))
    for sf1, sf2 in zipped:
        if sf1.name != sf2.name or not check_type_equal_ignore_nullable(
            sf1.dataType, sf2.dataType
        ):
            return False
    return True


def check_type_equal_ignore_nullable(t1, t2):
    """Checks column data types, ignoring nullability."""
    if t1.typeName() == t2.typeName():
        # Account for array types by inspecting elementType.
        if t1.typeName() == "array":
            return t1.elementType == t2.elementType
        return True
    return False

I can have a go at contributing if you're OK with it?

Mutable default arguments in Python (like `[]`) are slightly dangerous. If you mutate `transforms` inside this function, the mutations will persist to the next call of the function.


The typical solution to this is to set the default to None and then inside the function convert to [].

Originally posted by @nchammas in #16 (comment)
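A generic illustration of the pitfall and of the None-default fix (not chispa's actual signature):

def append_item_bad(item, items=[]):
    # The same list object is reused across calls, so items silently accumulate.
    items.append(item)
    return items


def append_item_good(item, items=None):
    # Defaulting to None and building the list inside avoids the shared state.
    if items is None:
        items = []
    items.append(item)
    return items


append_item_bad("a")   # ['a']
append_item_bad("b")   # ['a', 'b']  <- the previous call leaked in
append_item_good("a")  # ['a']
append_item_good("b")  # ['b']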

Investigate "SPARK_TESTING" environment variable

When this variable is set, a lot of Spark-internal things are disabled or simplified. It may significantly increase the speed of testing. One can see where this variable is checked in the Spark code. We could set it automatically before creating a Spark session.
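If it turns out to help, a sketch of how it could be set before the session is created (the actual speedup still needs to be measured):

import os

from pyspark.sql import SparkSession

# SPARK_TESTING must be set before the JVM / SparkSession is created.
os.environ["SPARK_TESTING"] = "1"

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("chispa-tests")
    .getOrCreate()
)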

chispa 1.0 release

It would be nice to develop chispa so we can make a 1.0 release.

We might even want to expose a different interface. Something like this:

from dataclasses import dataclass, field


@dataclass
class MyFormats:
    mismatched_rows: list = field(default_factory=lambda: ["light_yellow"])
    matched_rows: list = field(default_factory=lambda: ["cyan", "bold"])
    mismatched_cells: list = field(default_factory=lambda: ["purple"])
    matched_cells: list = field(default_factory=lambda: ["blue"])


my_chispa = Chispa(formats=MyFormats())

my_chispa.assert_df_equality(actual_df, expected_df)

The user could inject the my_chispa object in their tests as follows:

@pytest.fixture()
def my_chispa():
    return Chispa(formats=MyFormats())

def test_shows_assert_basic_rows_equality(my_chispa):
    ...
    my_chispa.assert_basic_rows_equality(df1.collect(), df2.collect())

It's worth contemplating at least.

assert_df_equality throws SchemasNotEqualError when the dataframes are identical (except for the metadata)

I have a test where I define how the production table will be created. I'm setting comments on the columns so the user who consumes this table can understand what each column means. The problem is that when I compare that table against a custom DataFrame in a test, chispa throws an exception due to a schema mismatch.

Example:

spark.sql("""
CREATE TABLE IF NOT EXISTS foo (
    id LONG COMMENT "a comment",
    value INT
)
""")
spark.sql("INSERT INTO foo values (1,1)")

df = spark.table("foo")
schema = T.StructType([
    T.StructField("id", T.LongType(), True),
    T.StructField("value", T.IntegerType(), True),
])
expected = spark.createDataFrame(data=[(1, 1)], schema=schema)

assert_df_equality(df, expected)

The assertion fails on the schema: the output shows that value is identical (because it has no metadata) but that id is not equal (even though it looks identical). If you remove the COMMENT clause from the table creation, the test passes. Being forced to add the metadata to the StructType is much more tedious; is there a chance to ignore the metadata via a boolean flag (ignore_schema_metadata)?
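Until such a flag exists, one workaround (a sketch only, reusing the spark, df and expected variables from the example above) is to rebuild both DataFrames with metadata-free schemas before asserting:

from pyspark.sql.types import StructField, StructType

from chispa import assert_df_equality


def drop_metadata(schema: StructType) -> StructType:
    # Rebuild the schema with empty metadata on every top-level field.
    return StructType([
        StructField(f.name, f.dataType, f.nullable, metadata={})
        for f in schema.fields
    ])


df_clean = spark.createDataFrame(df.rdd, drop_metadata(df.schema))
expected_clean = spark.createDataFrame(expected.rdd, drop_metadata(expected.schema))
assert_df_equality(df_clean, expected_clean)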

underline_cells failing if dataframes are different lengths

When using the underline_cells flag in assert_df_equality, if the DataFrames have different numbers of rows, the assertion function throws an exception.

from decimal import Decimal

from pyspark.sql import SparkSession
from pyspark.sql import types as T

from chispa.dataframe_comparer import assert_df_equality

spark = SparkSession.builder.getOrCreate()

schema = T.StructType(
    [
        T.StructField("id", T.StringType(), nullable=False),
        T.StructField("balance", T.DecimalType(38,6), nullable=True),
    ]
)

df1 = spark.createDataFrame(
    [
        [1, None],
        [2, Decimal(1.0)],
    ],
    schema=schema,
)
df2 = spark.createDataFrame(
    [
        [1, None],
        [2, Decimal(1.0)],
        [3, Decimal(100)],
    ],
    schema=schema,
)

This gives two dataframes, with different row counts:

df1.show()
+---+--------+
| id| balance|
+---+--------+
|  1|    null|
|  2|1.000000|
+---+--------+

df2.show()
+---+----------+
| id|   balance|
+---+----------+
|  1|      null|
|  2|  1.000000|
|  3|100.000000|
+---+----------+

When calling just assert_df_equality you get the expected comparison:

assert_df_equality(df1, df2)
---------------------------------------------------------------------------
DataFramesNotEqualError                   Traceback (most recent call last)
Cell In [16], line 1
----> 1 assert_df_equality(df1, df2)

File /opt/conda/lib/python3.9/site-packages/chispa/dataframe_comparer.py:27, in assert_df_equality(df1, df2, ignore_nullable, transforms, allow_nan_equality, ignore_column_order, ignore_row_order, underline_cells, ignore_metadata)
     24     assert_generic_rows_equality(
     25         df1.collect(), df2.collect(), are_rows_equal_enhanced, [True], underline_cells=underline_cells)
     26 else:
---> 27     assert_basic_rows_equality(
     28         df1.collect(), df2.collect(), underline_cells=underline_cells)

File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:25, in assert_basic_rows_equality(rows1, rows2, underline_cells)
     23         else:
     24             t.add_row([r1, r2])
---> 25 raise chispa.DataFramesNotEqualError("\n" + t.get_string())

DataFramesNotEqualError: 
+------------------------------------------+--------------------------------------------+
|                   df1                    |                    df2                     |
+------------------------------------------+--------------------------------------------+
|        Row(id='1', balance=None)         |         Row(id='1', balance=None)          |
| Row(id='2', balance=Decimal('1.000000')) |  Row(id='2', balance=Decimal('1.000000'))  |
|                   None                   | Row(id='3', balance=Decimal('100.000000')) |
+------------------------------------------+--------------------------------------------+

but when adding underline_cells you get an exception:

assert_df_equality(df1, df2, underline_cells=True)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In [17], line 1
----> 1 assert_df_equality(df1, df2, underline_cells=True)

File /opt/conda/lib/python3.9/site-packages/chispa/dataframe_comparer.py:27, in assert_df_equality(df1, df2, ignore_nullable, transforms, allow_nan_equality, ignore_column_order, ignore_row_order, underline_cells, ignore_metadata)
     24     assert_generic_rows_equality(
     25         df1.collect(), df2.collect(), are_rows_equal_enhanced, [True], underline_cells=underline_cells)
     26 else:
---> 27     assert_basic_rows_equality(
     28         df1.collect(), df2.collect(), underline_cells=underline_cells)

File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:21, in assert_basic_rows_equality(rows1, rows2, underline_cells)
     19 else:
     20     if underline_cells:
---> 21         t.add_row(__underline_cells_in_row(
     22             r1=r1, r2=r2, row_column_names=row_column_names, num_columns=num_columns))
     23     else:
     24         t.add_row([r1, r2])

File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:73, in __underline_cells_in_row(r1, r2, row_column_names, num_columns)
     70 else:
     71     append_str = ", "
---> 73 if r1[column] != r2[column]:
     74     r1_string += underline_text(
     75         f"{column}='{r1[column]}'") + f"{append_str}"
     76     r2_string += underline_text(
     77         f"{column}='{r2[column]}'") + f"{append_str}"

TypeError: 'NoneType' object is not subscriptable
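The crash appears to come from indexing a row that is None after the shorter DataFrame was padded; a guard along these lines (purely illustrative, not the actual chispa code) would avoid it:

def cells_differ(r1, r2, column) -> bool:
    # zip_longest pads the shorter DataFrame with None, so guard against
    # indexing a missing row before comparing the individual cells.
    if r1 is None or r2 is None:
        return True
    return r1[column] != r2[column]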

The it_does_not_throw_with_different_schema test exposes a bug

This test shouldn't be passing:

    def it_does_not_throw_with_different_schema():
        data1 = [(1.0, "jose"), (1.1, "li"), (1.2, "laura"), (None, None)]
        df1 = spark.createDataFrame(data1, ["num", "expected_name"])
        data2 = [("li", 1.05), ("laura", 1.2), (None, None), ("jose", 1.0)]
        df2 = spark.createDataFrame(data2, ["another_name", "same_num"])
        assert_approx_df_equality(df1, df2, 0.1, ignore_schema=True)

ignore_row_order isn't set (so it defaults to False), so this shouldn't be passing.

This is because d1.keys() & d2.keys() returns an empty set when the column names are different. The conditions are never actually checked, and the function just returns True.
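In other words, with disjoint column names the intersection of keys is empty, the loop body never runs, and the check passes vacuously:

d1 = {"num": 1.0, "expected_name": "jose"}
d2 = {"another_name": "jose", "same_num": 1.05}

d1.keys() & d2.keys()  # set() -- empty, so there is nothing to compare

for key in d1.keys() & d2.keys():
    ...  # never executes, so no difference can ever be detected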

assert None while ignore_metadata=True

Hello, I have an issue with the usage of ignore_metadata=True while comparing two schemas. Both of them have metadata, defined either explicitly or as an empty {}. When ignore_metadata=False, it seems that the mismatches based on metadata are found. But when I set ignore_metadata=True, the error I get is:

AssertionError: assert None where None = assert_schema_equality(StructType([StructField('...))], StructType([StructField(...)], ignore_nullable=True, ignore_metadata=True).

Can you help me on that one? Thanks in advance!

Refactor code to conform to PEP8

Non-essential, would help to improve readability of code by ensuring the style conforms to PEP8 standards, including:

  • Indentation
  • Max line length
  • Naming conventions
  • Comments

pkg_resources is deprecated, prettytable produces warning

Using assert_df_equality produces a warning in the console about pkg_resources being deprecated.

======================================================================================= warnings summary ========================================================================================
.venv/lib/python3.9/site-packages/pkg_resources/__init__.py:121
  /Users/ethnhll/Projects/dsc-publish/.venv/lib/python3.9/site-packages/pkg_resources/__init__.py:121: DeprecationWarning: pkg_resources is deprecated as an API
    warnings.warn("pkg_resources is deprecated as an API", DeprecationWarning)

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
================================================================================= 1 passed, 1 warning in 11.76s =================================================================================

Looking at the code where pkg_resources is referenced, it seems like it is not currently in use.

import pkg_resources

I'm not familiar enough with the build processes at play for this project so I can't say for sure that removing the dependency on pkg_resources in prettytable.py won't have unintended side effects, but I can open a PR that removes it.

assert_df_equality fails when comparing nan values

Hello,

First of all, thank you for this great package! It's really making my test code cleaner.

One issue I've encountered is that when I perform assert_df_equality() on two dataframes where each contains a row with NaN values, the test fails.
image

I can see why it would fail, as float("nan") == float("nan") evaluates to False. However, if I convert the DataFrames to pandas DataFrames first and then use pd.testing.assert_frame_equal, the test passes. It would be awesome if assert_df_equality could handle NaN cases as well.
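For what it's worth, the assert_df_equality signature quoted in another issue on this page includes an allow_nan_equality flag; a minimal example of using it (a sketch, with made-up data):

from pyspark.sql import SparkSession
from chispa import assert_df_equality

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([(1, float("nan")), (2, 2.0)], ["id", "score"])
df2 = spark.createDataFrame([(1, float("nan")), (2, 2.0)], ["id", "score"])

# With the flag enabled, NaN values in the same position are treated as equal.
assert_df_equality(df1, df2, allow_nan_equality=True)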

Thanks!

Give user control to customize output formatting

As noted in this pull request (#68), we want to give the user the ability to control the formatting of the output.

The formatting should be easy to configure for a given test and also easy to set globally for the entire test suite.

Here are the main concepts we want to model:

  • formatting for matched rows, unmatched rows, matched cells, unmatched cells
  • reprinting the DataFrame columns that don't match (for wide DataFrame comparisons). See this PR: #48.
  • displaying the diff DataFrame, see this PR: #35

The formatting should let the user configure color, underline, and bold.

These settings should be globally applicable to all the interfaces in the project including schema comparisons, DataFrame comparisons, StructField comparisons, and column comparisons.

Something like this could work:

{
  "mismatched_rows": ["red", "bold"],
  "matched_rows": "blue",
  "mismatched_cells": ["white", "underline"],
  "print_dif": True,
  "print_mismatched_cols": True
}

The user should be able to set this globally and then override for a given test (they should be able to partially override).

The user should also be able to ignore this entirely and just rely on the built-in defaults.
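A sketch of how the partial override could behave (purely illustrative; the keys mirror the proposal above):

GLOBAL_FORMATS = {
    "mismatched_rows": ["red", "bold"],
    "matched_rows": "blue",
    "mismatched_cells": ["white", "underline"],
    "print_diff": True,
    "print_mismatched_cols": True,
}


def resolve_formats(overrides=None):
    # A per-test override only mentions the keys it changes; everything else
    # falls back to the global settings.
    return {**GLOBAL_FORMATS, **(overrides or {})}


resolve_formats({"mismatched_rows": ["purple"]})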

Hopefully we can make the outputs look good on both Mac and Windows machines.

Python 2.7 Compatibility

Dear team,

I am trying to integrate this library into my build process for unit tests and got the error below during the build. We are on a Python 2.7 environment. Can you please help here?

Traceback (most recent call last):
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/nose/loader.py", line 418, in loadTestsFromName
    addr.filename, addr.module)
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/nose/importer.py", line 47, in importFromPath
    return self.importFromDir(dir_path, fqname)
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/nose/importer.py", line 94, in importFromDir
    mod = load_module(part_fqname, fh, filename, desc)
  File "/var/lib/jenkins/workspace/myproject/test/test_functions.py", line 3, in <module>
    from chispa import *
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/chispa/__init__.py", line 1, in <module>
    from .dataframe_comparer import DataFramesNotEqualError, assert_df_equality, assert_approx_df_equality
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/chispa/dataframe_comparer.py", line 2, in <module>
    from chispa.bcolors import *
  File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/chispa/bcolors.py", line 28
    def blue(s: str) -> str:
