mrpowers / chispa
PySpark test helper methods with beautiful error messages
Home Page: https://mrpowers.github.io/chispa/
License: MIT License
Hi, can we add a function to flatten a nested dataframe?
from pyspark.sql.functions import col, explode_outer, map_keys
from pyspark.sql.types import ArrayType, MapType, StringType, StructField, StructType


def flatten_test(df, sep="_"):
    """Returns a flattened dataframe.

    .. versionadded:: x.X.X

    Parameters
    ----------
    df : DataFrame
        The nested dataframe to flatten.
    sep : str
        Delimiter for flattened columns. Default `_`.

    Notes
    -----
    Don't use `.` as `sep`: it won't work on nested data frames with more
    than one level, and you would have to use `columns.name`.
    Flattening MapType columns requires finding every key in the column,
    which can be slow.

    Examples
    --------
    data_mixed = [
        {
            "state": "Florida",
            "shortname": "FL",
            "info": {"governor": "Rick Scott"},
            "counties": [
                {"name": "Dade", "population": 12345},
                {"name": "Broward", "population": 40000},
                {"name": "Palm Beach", "population": 60000},
            ],
        },
        {
            "state": "Ohio",
            "shortname": "OH",
            "info": {"governor": "John Kasich"},
            "counties": [
                {"name": "Summit", "population": 1234},
                {"name": "Cuyahoga", "population": 1337},
            ],
        },
    ]
    data_mixed = spark.createDataFrame(data=data_mixed)
    data_mixed.printSchema()
    root
    |-- counties: array (nullable = true)
    |    |-- element: map (containsNull = true)
    |    |    |-- key: string
    |    |    |-- value: string (valueContainsNull = true)
    |-- info: map (nullable = true)
    |    |-- key: string
    |    |-- value: string (valueContainsNull = true)
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)

    data_mixed_flat = flatten_test(data_mixed, sep=":")
    data_mixed_flat.printSchema()
    root
    |-- shortname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- counties:name: string (nullable = true)
    |-- counties:population: string (nullable = true)
    |-- info:governor: string (nullable = true)

    data = [
        {
            "id": 1,
            "name": "Cole Volk",
            "fitness": {"height": 130, "weight": 60},
        },
        {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
        {
            "id": 2,
            "name": "Faye Raker",
            "fitness": {"height": 130, "weight": 60},
        },
    ]
    df = spark.createDataFrame(data=data)
    df.printSchema()
    root
    |-- fitness: map (nullable = true)
    |    |-- key: string
    |    |-- value: long (valueContainsNull = true)
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)

    df_flat = flatten_test(df, sep=":")
    df_flat.printSchema()
    root
    |-- id: long (nullable = true)
    |-- name: string (nullable = true)
    |-- fitness:height: long (nullable = true)
    |-- fitness:weight: long (nullable = true)

    data_struct = [
        (("James", None, "Smith"), "OH", "M"),
        (("Anna", "Rose", ""), "NY", "F"),
        (("Julia", "", "Williams"), "OH", "F"),
        (("Maria", "Anne", "Jones"), "NY", "M"),
        (("Jen", "Mary", "Brown"), "NY", "M"),
        (("Mike", "Mary", "Williams"), "OH", "M"),
    ]
    schema = StructType([
        StructField("name", StructType([
            StructField("firstname", StringType(), True),
            StructField("middlename", StringType(), True),
            StructField("lastname", StringType(), True),
        ])),
        StructField("state", StringType(), True),
        StructField("gender", StringType(), True),
    ])
    df_struct = spark.createDataFrame(data=data_struct, schema=schema)
    df_struct.printSchema()
    root
    |-- name: struct (nullable = true)
    |    |-- firstname: string (nullable = true)
    |    |-- middlename: string (nullable = true)
    |    |-- lastname: string (nullable = true)
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)

    df_struct_flat = flatten_test(df_struct, sep=":")
    df_struct_flat.printSchema()
    root
    |-- state: string (nullable = true)
    |-- gender: string (nullable = true)
    |-- name:firstname: string (nullable = true)
    |-- name:middlename: string (nullable = true)
    |-- name:lastname: string (nullable = true)
    """
    # compute the complex fields (ArrayType, StructType, and MapType) in the schema
    complex_fields = dict(
        (field.name, field.dataType)
        for field in df.schema.fields
        if isinstance(field.dataType, (ArrayType, StructType, MapType))
    )
    while len(complex_fields) != 0:
        col_name = list(complex_fields.keys())[0]
        # print("Processing: " + col_name + " Type: " + str(type(complex_fields[col_name])))

        # if StructType, convert each sub-element to a column,
        # i.e. flatten the struct
        if isinstance(complex_fields[col_name], StructType):
            expanded = [
                col(col_name + "." + k).alias(col_name + sep + k)
                for k in [n.name for n in complex_fields[col_name]]
            ]
            df = df.select("*", *expanded).drop(col_name)
        # if ArrayType, add the array elements as rows using explode_outer,
        # i.e. explode the array
        elif isinstance(complex_fields[col_name], ArrayType):
            df = df.withColumn(col_name, explode_outer(col_name))
        # if MapType, convert each key to a column,
        # i.e. flatten the map (this collects every distinct key, which can be slow)
        elif isinstance(complex_fields[col_name], MapType):
            keys_df = df.select(explode_outer(map_keys(col(col_name)))).distinct()
            keys = [row[0] for row in keys_df.collect()]
            key_cols = [col(col_name).getItem(k).alias(col_name + sep + k) for k in keys]
            remaining_cols = [c for c in df.columns if c != col_name]
            df = df.select(remaining_cols + key_cols)
        # recompute the remaining complex fields in the schema
        complex_fields = dict(
            (field.name, field.dataType)
            for field in df.schema.fields
            if isinstance(field.dataType, (ArrayType, StructType, MapType))
        )
    return df
When there are StructTypes and ignore_nullable is set to True, it will still raise an error when the nullable flags of nested fields differ.
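For illustration, a schema pair of the kind that seems to trigger this (the field names here are assumptions, not taken from the issue):

from pyspark.sql.types import StringType, StructField, StructType

# Two schemas that differ only in the nullability of a nested field.
s1 = StructType([
    StructField("person", StructType([StructField("name", StringType(), True)]), True),
])
s2 = StructType([
    StructField("person", StructType([StructField("name", StringType(), False)]), True),
])
# Comparing DataFrames with these schemas and ignore_nullable=True still
# reports a mismatch, because the nested nullable flags are compared.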
The white font color used to underline cell differences isn't great in my terminal, for example. It'd be totally unreadable in a terminal with a white background.
We should try to make two improvements:
I have two dataframes that appear identical, but assert_approx_df_equality is throwing a DataFramesNotEqualError. There may be an intermittent issue going on, because this code passed on the development cluster but failed in the test pipeline. Also, changing the precision from 0.001 to 1.0 allows the test to pass, although I don't see any differences in the actual vs. expected output.
actual_df = ...create the dataframe with my component...
expected_data = [
    ('POINT (2.5 1.5)', 1.0, 1.0, 0.7071067811865476, 2.0, 2.0, False),
    ('POINT (2.55 2.25)', 2.0, 2.0, 0.14142135623730964, 2.65, 2.35, False),
    ('POINT (4.75 2.5)', 3.0, 3.0, 0.5, 5.25, 2.5, False),
    ('POINT EMPTY', 4.0, None, -999.0, float('nan'), float('nan'), False),
]
expected_df = spark.createDataFrame(
    expected_data, ["wkt", "point_id", "poly_id", "distance", "X", "Y", "isOnRight"]
).sort("point_id")
actual_df.show()
expected_df.show()
assert_approx_df_equality(actual_df, expected_df, 0.001, ignore_nullable=True)
the output of the show commands:
+-----------------+--------+-------+-------------------+----+----+---------+
| wkt|point_id|poly_id| distance| X| Y|isOnRight|
+-----------------+--------+-------+-------------------+----+----+---------+
| POINT (2.5 1.5)| 1.0| 1.0| 0.7071067811865476| 2.0| 2.0| false|
|POINT (2.55 2.25)| 2.0| 2.0|0.14142135623730964|2.65|2.35| false|
| POINT (4.75 2.5)| 3.0| 3.0| 0.5|5.25| 2.5| false|
| POINT EMPTY| 4.0| null| -999.0| NaN| NaN| false|
+-----------------+--------+-------+-------------------+----+----+---------+
+-----------------+--------+-------+-------------------+----+----+---------+
| wkt|point_id|poly_id| distance| X| Y|isOnRight|
+-----------------+--------+-------+-------------------+----+----+---------+
| POINT (2.5 1.5)| 1.0| 1.0| 0.7071067811865476| 2.0| 2.0| false|
|POINT (2.55 2.25)| 2.0| 2.0|0.14142135623730964|2.65|2.35| false|
| POINT (4.75 2.5)| 3.0| 3.0| 0.5|5.25| 2.5| false|
| POINT EMPTY| 4.0| null| -999.0| NaN| NaN| false|
+-----------------+--------+-------+-------------------+----+----+---------+
The exception shows the last three rows are different though I can't spot the differences.
DataFramesNotEqualError Traceback (most recent call last)
<command-340851985589312> in <module>
50 expected_df.show()
51
---> 52 assert_approx_df_equality(actual_df, expected_df, 0.001, ignore_nullable=True)
/databricks/python/lib/python3.7/site-packages/chispa/dataframe_comparer.py in assert_approx_df_equality(df1, df2, precision, ignore_nullable)
38 def assert_approx_df_equality(df1, df2, precision, ignore_nullable=False):
39 assert_schema_equality(df1.schema, df2.schema, ignore_nullable)
---> 40 assert_generic_rows_equality(df1, df2, are_rows_approx_equal, [precision])
41
42
/databricks/python/lib/python3.7/site-packages/chispa/dataframe_comparer.py in assert_generic_rows_equality(df1, df2, row_equality_fun, row_equality_fun_args)
62 t.add_row([r1, r2])
63 if allRowsEqual == False:
---> 64 raise DataFramesNotEqualError("\n" + t.get_string())
65
66
DataFramesNotEqualError:
+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
| df1 | df2 |
+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
| Row(wkt='POINT (2.5 1.5)', point_id=1.0, poly_id=2.0, distance=0.7071067811865476, X=3.0, Y=2.0, isOnRight=False) | Row(wkt='POINT (2.5 1.5)', point_id=1.0, poly_id=1.0, distance=0.7071067811865476, X=2.0, Y=2.0, isOnRight=False) |
| Row(wkt='POINT (2.55 2.25)', point_id=2.0, poly_id=2.0, distance=0.14142135623730964, X=2.65, Y=2.35, isOnRight=False) | Row(wkt='POINT (2.55 2.25)', point_id=2.0, poly_id=2.0, distance=0.14142135623730964, X=2.65, Y=2.35, isOnRight=False) |
| Row(wkt='POINT (4.75 2.5)', point_id=3.0, poly_id=3.0, distance=0.5, X=5.25, Y=2.5, isOnRight=False) | Row(wkt='POINT (4.75 2.5)', point_id=3.0, poly_id=3.0, distance=0.5, X=5.25, Y=2.5, isOnRight=False) |
| Row(wkt='POINT EMPTY', point_id=4.0, poly_id=None, distance=-999.0, X=nan, Y=nan, isOnRight=False) | Row(wkt='POINT EMPTY', point_id=4.0, poly_id=None, distance=-999.0, X=nan, Y=nan, isOnRight=False) |
+------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------+
and the two schemas compared:
root
|-- wkt: string (nullable = true)
|-- point_id: double (nullable = true)
|-- poly_id: double (nullable = true)
|-- distance: double (nullable = true)
|-- X: double (nullable = true)
|-- Y: double (nullable = true)
|-- isOnRight: boolean (nullable = true)
root
|-- wkt: string (nullable = true)
|-- point_id: double (nullable = true)
|-- poly_id: double (nullable = true)
|-- distance: double (nullable = true)
|-- X: double (nullable = true)
|-- Y: double (nullable = true)
|-- isOnRight: boolean (nullable = true)
When calling assert_df_equality and assert_approx_df_equality, it would be good to have the option to not display the get_string() output. Sometimes the output might be too long or truncated. I think this could be a good addition.
Example of the idea:
if full_log:
    raise DataFramesNotEqualError("\n" + t.get_string())
else:
    raise DataFramesNotEqualError()

where full_log is a boolean.
Hi there,
Thank you for the effort on this library. Perhaps it would be great to add an extra assert that ignores row order in the DataFrame.
Some of the functions, for example df.dropDuplicates(), are unpredictable in order. It would be great to have something that just ignores the order, or that orders all records by a given column; see the sketch below.
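A minimal sketch of what such an assert could look like (the helper name is hypothetical, not chispa's API):

def assert_df_equality_ignore_row_order(df1, df2):
    # Sort the collected rows before comparing; sorting by repr() sidesteps
    # rows containing None, which Python 3 cannot order directly.
    rows1 = sorted(df1.collect(), key=repr)
    rows2 = sorted(df2.collect(), key=repr)
    assert rows1 == rows2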
Thank you for any consideration!
Good luck!
Related to #70: when metadata is different between two schemas, the printout in the error logs doesn't explicitly show the difference in metadata, since it only uses the __repr__ from pyspark's StructField.
Hi, I'm using chispa for tests. I found that it paints some columns red in a report even if they are not incorrect. Look at the example below. I have the size_cd and retail_week columns reshuffled, which is causing an assertion error. But chispa also paints red all the other columns that have a different nullable option, even though I'm using the ignore_nullable parameter.
I think it would be better to paint such columns blue. Right now it feels like chispa does not work as intended with the ignore_nullable parameter.
The mack project has a bunch of nice badges:
We should add nice badges to this project README as well.
I think it could be useful to run unit tests against multiple versions of Python on each PR, to gain extra confidence in the proposed changes of a PR. This also builds extra confidence that chispa actually works on the Python versions specified in pyproject.toml, and could prevent issues such as #78 in the future.
I have created a draft PR to solve this: #81
Since Spark 2.3 there is the PySpark function eqNullSafe; this seems a much better way to compare columns, and it can also be used to compare dataframes.
Advantages:
For DataFrames it would mean some sort of loop over the columns and then a reduce to check that all members of the resulting column are true, as sketched below. I think it is worth the change for the two reasons given above.
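A rough sketch of that loop-and-reduce idea, assuming the two DataFrames have already been aligned row-by-row into one DataFrame (e.g. via a join on a key), with each compared column present as a left_/right_ pair; the helper name is hypothetical:

from functools import reduce
from pyspark.sql import functions as F

def all_rows_match(joined, column_names):
    # Build one boolean condition per column pair with eqNullSafe, which
    # treats NULL == NULL as true, then AND the conditions together.
    condition = reduce(
        lambda acc, c: acc & F.col("left_" + c).eqNullSafe(F.col("right_" + c)),
        column_names,
        F.lit(True),
    )
    # All rows are equal iff no row fails the combined condition.
    return joined.filter(~condition).limit(1).count() == 0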
Maybe I'm the outlier, but I consider the more intuitive check -- especially for testing purposes -- to be one that ignores order. If some function produces a DataFrame that I want to check, I care about the contents. And by default, Spark offers no guarantees on row order unless your plan has an explicit .orderBy(). So relying on the stability of row order in the absence of an explicit order-by clause is a recipe for surprises, much like it is in SQL.
In fact, I don't think .collect() even provides any guarantees that the row order of the resulting array will match the row order of the original DataFrame -- again, unless the DataFrame has an explicit ordering specified. It's theoretically possible, for example, that you could call spark.range(3).collect() twice and get different row orders each time. So if you're relying on .collect() to preserve order without explicit ordering on the original DataFrames, then I would say that's technically incorrect.
By the way, in your own usages of this library (or the Scala equivalent), how often do you compare DataFrames where you care about the row order? I'm curious to see a few examples of that.
Originally posted by @nchammas in #19 (comment)
Right now, the chispa package has a hard dependency on pyspark, making it hard to use with the Databricks runtime or other compatible Spark runtimes. Instead, this package should either rely completely on an implicit dependency, or use something like the findspark package, as done in spark-testing-base or in pytest-spark.
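One possible shape for the soft-import idea (a sketch, not an actual chispa change):

# Fail lazily with a helpful message instead of declaring a hard pyspark
# dependency in the package metadata.
try:
    import pyspark  # provided by the runtime (e.g. Databricks) or by the user
except ImportError as e:
    raise ImportError(
        "chispa requires an existing pyspark installation; "
        "install pyspark or run inside a Spark runtime"
    ) from e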
I created a pull request and confirmed that chispa is fully compliant with Spark Connect: #86
We don't want to make chispa depend on Spark Connect, because then it'd have to depend on PySpark >= 3.5. We want chispa to support many old Spark versions for users that are still on legacy Spark runtimes.
It would be nice to test chispa + Spark Connect in the CI though. We don't ever want to add any chispa code that wouldn't work with Spark Connect. We want all Spark Connect users to always have access to chispa as well.
Any thoughts on how to add Spark Connect to the CI test suite?
When using ignore_nullable=True, chispa still sees differences in ArrayType columns because there's a nullable difference in the inner type:
StructField(my_arr_col,ArrayType(StringType,false),false)
StructField(my_arr_col,ArrayType(StringType,true),true)
Sometimes we just need to check whether two dataframes have equal values, irrespective of their schemas. How about introducing an argument ignore_schema to skip the schema check?
https://github.com/MrPowers/chispa/blob/main/chispa/dataframe_comparer.py#L23
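A hypothetical sketch of the proposed flag; the helper names mirror the functions visible in the tracebacks elsewhere on this page, but this is not the actual implementation:

from chispa.rows_comparer import assert_basic_rows_equality  # path shown in the tracebacks
from chispa.schema_comparer import assert_schema_equality  # module path assumed

def assert_df_equality_values_only(df1, df2, ignore_schema=False):
    if not ignore_schema:
        assert_schema_equality(df1.schema, df2.schema)
    # When ignore_schema is set, only the collected row values are compared.
    assert_basic_rows_equality(df1.collect(), df2.collect())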
This project is set up for Poetry development.
Is there a way to set up this project for conda development as well? I want to keep using Poetry for my personal development and to build / release the wheel files.
Is there a way to structure the project so people with conda installed on their machines can get a virtual environment properly set up? Conda is the other popular Python virtual environment tool, correct?
See here for an example: MrPowers/quinn#98
This is a good way to make sure we keep supporting legacy versions of Spark well for a long time.
We always want to support lots of Spark versions.
Let's add some unit tests that highlight the limitations of this library.
Situations like DataFrames with nested schemas, nested arrays, NaN values, and any other weird edge case that we need to support.
This will hopefully encourage the open source community to fill the gaps.
It would be great if we could avoid column order checking when using assert_approx_df_equality.
E chispa.dataframe_comparer.SchemasNotEqualError:
E +------------------------------------------+------------------------------------------+
E | schema1 | schema2 |
E +------------------------------------------+------------------------------------------+
E | StructField(second_name,StringType,true) | StructField(second_name,StringType,true) |
E | StructField(id,LongType,true) | StructField(id,LongType,true) |
E | StructField(floor,LongType,true) | StructField(floor,LongType,true) |
E | StructField(first_name,StringType,true) | StructField(first_name,StringType,true) |
E +------------------------------------------+------------------------------------------+
The output didn't show that schema2 has one more column, 'age'.
Importing chispa can cause issues on Python 3.10 due to the use of six 1.15.0. See also secdev/scapy#3502 and this blogpost. As the blogpost describes, the issue is patched in six 1.16.0.
When trying assert_df_equality with allow_nan_equality=True, if both DataFrames hold an array that contains some NaN values, then the comparer fails, even if the NaNs are in the same place.
An additional check should take place here to compare the array elements.
Lines 16 to 20 in 500793e
Again, I can help contribute to this one but it won't be until next week.
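A minimal sketch of such an element-wise check (a plain-Python helper with a hypothetical name, not chispa's actual code):

import math

def arrays_equal_with_nan(a1, a2):
    # Treat NaN == NaN as equal when comparing array elements pairwise.
    if a1 is None or a2 is None:
        return a1 == a2
    if len(a1) != len(a2):
        return False
    for x, y in zip(a1, a2):
        both_nan = (
            isinstance(x, float) and isinstance(y, float)
            and math.isnan(x) and math.isnan(y)
        )
        if not both_nan and x != y:
            return False
    return True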
Here's a comparison example:
assert_df_equality is very handy for catching regressions in PySpark ETL functions.
But debugging is hard when there are many columns (e.g. more than 5-10 with long names). The pretty-table row-comparison output spreads records over multiple lines, and it becomes difficult to manually identify the discrepancies.
I'd like to request a feature which could make this easier: optional highlighting of diffs. When two rows don't match and are printed in red text, the mismatching values could also be highlighted, e.g. with a yellow background color.
To highlight those values, rather than or in addition to testing for equality, I think we'd need to identify and return them for the PrettyTable display to handle differently. E.g. collect the column keys with mismatching values here and here, then search for their corresponding tokens and color them differently in the strings here and here.
Maybe try colorama for background highlighting.
If the schema contains arrays, and the arrays contain types with different nullability, the "ignore_nullable" option doesn't work.
Consider changing to the following:
import six


def are_schemas_equal_ignore_nullable(s1, s2):
    if len(s1) != len(s2):
        return False
    zipped = list(six.moves.zip_longest(s1, s2))
    for sf1, sf2 in zipped:
        if sf1.name != sf2.name or not check_type_equal_ignore_nullable(sf1, sf2):
            return False
    return True


def check_type_equal_ignore_nullable(sf1, sf2):
    """Checks StructField data types, ignoring nullability."""
    dt1, dt2 = sf1.dataType, sf2.dataType
    if dt1.typeName() == dt2.typeName():
        # Account for array types by inspecting the element type.
        if dt1.typeName() == "array":
            return dt1.elementType == dt2.elementType
        else:
            return True
    else:
        return False
I can have a go at contributing if you're OK with it?
Mutable default arguments in Python (like []) are slightly dangerous. If you mutate transforms inside this function, the mutations will persist to the next call of the function.
The typical solution is to set the default to None and then convert it to [] inside the function.
Originally posted by @nchammas in #16 (comment)
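The standard fix, sketched with a hypothetical function name rather than chispa's exact signature:

def compare_with_transforms(df1, df2, transforms=None):
    if transforms is None:
        transforms = []  # a fresh list per call; mutations can no longer leak
    for transform in transforms:
        df1 = transform(df1)
        df2 = transform(df2)
    return df1, df2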
Sometimes we want to perform DataFrame equality comparisons without considering the nullable flag. Might as well also add options to ignore column names and to perform an unordered comparison, similar to spark-fast-tests.
See the change that was made in quinn: MrPowers/quinn#129
When this variable is set, a lot of Spark-internal things are disabled or simplified, which may significantly increase the speed of testing. One can check for this variable in the Spark code. We could set it automatically before creating a Spark session.
I used this parameter in 9.2, but it's no longer there in 9.3. Why was it removed, and does it mean I can no longer run unit tests without comparing types?
It would be nice to develop chispa so we can make a 1.0 release.
We might even want to expose a different interface. Something like this:
from dataclasses import dataclass, field


@dataclass
class MyFormats:
    mismatched_rows: list = field(default_factory=lambda: ["light_yellow"])
    matched_rows: list = field(default_factory=lambda: ["cyan", "bold"])
    mismatched_cells: list = field(default_factory=lambda: ["purple"])
    matched_cells: list = field(default_factory=lambda: ["blue"])
my_chispa = Chispa(formats=MyFormats())
my_chispa.assert_df_equality(actual_df, expected_df)
The user could inject the my_chispa object in their tests as follows:
@pytest.fixture()
def my_chispa():
    return Chispa(formats=MyFormats())


def test_shows_assert_basic_rows_equality(my_chispa):
    ...
    my_chispa.assert_basic_rows_equality(df1.collect(), df2.collect())
It's worth contemplating at least.
I have a test where I define how the production table will be created. I'm setting some comments on the columns so that the user who consumes this table can understand what each column contains. The problem is that when I test that table against a custom dataframe, chispa throws an exception due to a schema mismatch.
Example:
from pyspark.sql import types as T
from chispa import assert_df_equality

spark.sql("""
    CREATE TABLE IF NOT EXISTS foo (
        id LONG COMMENT "a comment",
        value INT
    )
""")
spark.sql("INSERT INTO foo VALUES (1, 1)")

df = spark.table("foo")

schema = T.StructType([
    T.StructField("id", T.LongType(), True),
    T.StructField("value", T.IntegerType(), True),
])
expected = spark.createDataFrame(data=[(1, 1)], schema=schema)

assert_df_equality(df, expected)
The assertion fails on the schema. The output shows that value is identical (because it has no metadata), but that id is not equal (even though it looks identical). If you remove the COMMENT clause from the table creation, the test passes. Being forced to add the metadata in the StructType is way more tedious; is there a chance to ignore the metadata using a boolean (ignore_schema_metadata)?
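For illustration, a metadata-insensitive check could look something like this (a hypothetical helper, not chispa's implementation):

def schemas_equal_ignore_metadata(s1, s2):
    # Compare only name, data type, and nullability; drop the metadata dict.
    # Sufficient for flat schemas like the example above; nested structs
    # would need recursion, since their fields carry metadata too.
    def strip(field):
        return (field.name, field.dataType, field.nullable)
    return [strip(f) for f in s1.fields] == [strip(f) for f in s2.fields]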
I feel like this would be quite useful. Was there a design reason it wasn't included, or would this be a useful addition?
chispa/chispa/dataframe_comparer.py
Lines 38 to 40 in 500793e
When using the underline_cells flag in assert_df_equality, if the dataframes have different numbers of rows, the assertion function throws an exception.
from decimal import Decimal

from pyspark.sql import SparkSession
from pyspark.sql import types as T

from chispa.dataframe_comparer import assert_df_equality

spark = SparkSession.builder.getOrCreate()

schema = T.StructType(
    [
        T.StructField("id", T.StringType(), nullable=False),
        T.StructField("balance", T.DecimalType(38, 6), nullable=True),
    ]
)
df1 = spark.createDataFrame(
    [
        [1, None],
        [2, Decimal(1.0)],
    ],
    schema=schema,
)
df2 = spark.createDataFrame(
    [
        [1, None],
        [2, Decimal(1.0)],
        [3, Decimal(100)],
    ],
    schema=schema,
)
This gives two dataframes, with different row counts:
df1.show()
+---+--------+
| id| balance|
+---+--------+
| 1| null|
| 2|1.000000|
+---+--------+
df2.show()
+---+----------+
| id| balance|
+---+----------+
| 1| null|
| 2| 1.000000|
| 3|100.000000|
+---+----------+
When calling just assert_df_equality, you get the expected comparison:
assert_df_equality(df1, df2)
---------------------------------------------------------------------------
DataFramesNotEqualError Traceback (most recent call last)
Cell In [16], line 1
----> 1 assert_df_equality(df1, df2)
File /opt/conda/lib/python3.9/site-packages/chispa/dataframe_comparer.py:27, in assert_df_equality(df1, df2, ignore_nullable, transforms, allow_nan_equality, ignore_column_order, ignore_row_order, underline_cells, ignore_metadata)
24 assert_generic_rows_equality(
25 df1.collect(), df2.collect(), are_rows_equal_enhanced, [True], underline_cells=underline_cells)
26 else:
---> 27 assert_basic_rows_equality(
28 df1.collect(), df2.collect(), underline_cells=underline_cells)
File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:25, in assert_basic_rows_equality(rows1, rows2, underline_cells)
23 else:
24 t.add_row([r1, r2])
---> 25 raise chispa.DataFramesNotEqualError("\n" + t.get_string())
DataFramesNotEqualError:
+------------------------------------------+--------------------------------------------+
| df1 | df2 |
+------------------------------------------+--------------------------------------------+
| Row(id='1', balance=None) | Row(id='1', balance=None) |
| Row(id='2', balance=Decimal('1.000000')) | Row(id='2', balance=Decimal('1.000000')) |
| None | Row(id='3', balance=Decimal('100.000000')) |
+------------------------------------------+--------------------------------------------+
but when adding underline_cells, you get an exception:
assert_df_equality(df1, df2, underline_cells=True)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In [17], line 1
----> 1 assert_df_equality(df1, df2, underline_cells=True)
File /opt/conda/lib/python3.9/site-packages/chispa/dataframe_comparer.py:27, in assert_df_equality(df1, df2, ignore_nullable, transforms, allow_nan_equality, ignore_column_order, ignore_row_order, underline_cells, ignore_metadata)
24 assert_generic_rows_equality(
25 df1.collect(), df2.collect(), are_rows_equal_enhanced, [True], underline_cells=underline_cells)
26 else:
---> 27 assert_basic_rows_equality(
28 df1.collect(), df2.collect(), underline_cells=underline_cells)
File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:21, in assert_basic_rows_equality(rows1, rows2, underline_cells)
19 else:
20 if underline_cells:
---> 21 t.add_row(__underline_cells_in_row(
22 r1=r1, r2=r2, row_column_names=row_column_names, num_columns=num_columns))
23 else:
24 t.add_row([r1, r2])
File /opt/conda/lib/python3.9/site-packages/chispa/rows_comparer.py:73, in __underline_cells_in_row(r1, r2, row_column_names, num_columns)
70 else:
71 append_str = ", "
---> 73 if r1[column] != r2[column]:
74 r1_string += underline_text(
75 f"{column}='{r1[column]}'") + f"{append_str}"
76 r2_string += underline_text(
77 f"{column}='{r2[column]}'") + f"{append_str}"
TypeError: 'NoneType' object is not subscriptable
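One possible guard, sketched as a hypothetical helper (an assumption about a fix, not the shipped patch): only underline when both rows exist, otherwise fall back to rendering the raw rows.

def safe_add_row(t, r1, r2, underline_fn, row_column_names, num_columns):
    # When one DataFrame has more rows, the shorter side yields None and
    # indexing into it raises TypeError; render the raw rows instead.
    if r1 is not None and r2 is not None:
        t.add_row(underline_fn(r1=r1, r2=r2,
                               row_column_names=row_column_names,
                               num_columns=num_columns))
    else:
        t.add_row([r1, r2])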
This test shouldn't be passing:
def it_does_not_throw_with_different_schema():
    data1 = [(1.0, "jose"), (1.1, "li"), (1.2, "laura"), (None, None)]
    df1 = spark.createDataFrame(data1, ["num", "expected_name"])
    data2 = [("li", 1.05), ("laura", 1.2), (None, None), ("jose", 1.0)]
    df2 = spark.createDataFrame(data2, ["another_name", "same_num"])
    assert_approx_df_equality(df1, df2, 0.1, ignore_schema=True)
ignore_row_order isn't set (it defaults to False), so this shouldn't be passing.
This is because d1.keys() & d2.keys() returns an empty set when the column names are different: the conditions are never actually checked, and the comparison returns True.
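A small illustration of that failure mode (plain Python, with assumed row dictionaries):

d1 = {"num": 1.0, "expected_name": "jose"}
d2 = {"another_name": "li", "same_num": 1.05}
shared = d1.keys() & d2.keys()  # empty set: the column names don't overlap
# all() over an empty iterable is vacuously True, so the rows "match".
print(all(abs(d1[k] - d2[k]) <= 0.1 for k in shared))  # True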
Hello, I have an issue with the usage of ignore_metadata=True while comparing two schemas. Both of them have metadata, defined explicitly or as an empty {}. When ignore_metadata=False, the mismatches based on metadata are found. But when I set ignore_metadata=True, the error I get is:
AssertionError: assert None where None = assert_schema_equality(StructType([StructField('...))], StructType([StructField(...)], ignore_nullable=True, ignore_metadata=True).
Can you help me on that one? Thanks in advance!
Non-essential, but this would help improve the readability of the code by ensuring the style conforms to PEP 8 standards, including:
These comparers should be abstracted into separate files and fully unit tested.
DataType comparisons are more complex than they seem at first blush. There are a bunch of nested options that deserve full unit testing.
Using assert_df_equality produces a warning in the console about pkg_resources being deprecated.
======================================================================================= warnings summary ========================================================================================
.venv/lib/python3.9/site-packages/pkg_resources/__init__.py:121
/Users/ethnhll/Projects/dsc-publish/.venv/lib/python3.9/site-packages/pkg_resources/__init__.py:121: DeprecationWarning: pkg_resources is deprecated as an API
warnings.warn("pkg_resources is deprecated as an API", DeprecationWarning)
-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
================================================================================= 1 passed, 1 warning in 11.76s =================================================================================
Looking at the code where pkg_resources is referenced, it seems like it is not currently in use.
Line 44 in 2f1de26
I'm not familiar enough with the build processes at play for this project, so I can't say for sure that removing the dependency on pkg_resources in prettytable.py won't have unintended side effects, but I can open a PR that removes it.
Hello,
First of all, thank you for this great package! It's really making my test code cleaner.
One issue I've encountered is that when I perform assert_df_equality() on two dataframes where each contains a row with NaN values, the test fails.
I can see why it would fail, as float("nan") == float("nan") evaluates to False. However, if I convert the dataframes to pandas DataFrames first and then use pd.testing.assert_frame_equal, the test passes. It would be awesome if assert_df_equality could handle NaN cases as well.
Thanks!
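For reference, the asymmetry the report describes, plus a NaN-aware scalar comparison of the kind that would make such a test pass (the helper name is hypothetical):

import math

print(float("nan") == float("nan"))  # False: NaN is never equal to itself

def nan_safe_equal(a, b):
    # Treat two NaNs as equal, mirroring pandas' assert_frame_equal behavior.
    if isinstance(a, float) and isinstance(b, float) and math.isnan(a) and math.isnan(b):
        return True
    return a == b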
As noted in this pull request (#68), we want to give the user the ability to control the formatting of the output.
The formatting should be easy to configure for a given test and also easy to set globally for the entire test suite.
Here are the main concepts we want to model:
The formatting should let the user configure color, underline, and bold.
These settings should be globally applicable to all the interfaces in the project including schema comparisons, DataFrame comparisons, StructField comparisons, and column comparisons.
Something like this could work:
{
    "mismatched_rows": ["red", "bold"],
    "matched_rows": "blue",
    "mismatched_cells": ["white", "underline"],
    "print_dif": True,
    "print_mismatched_cols": True
}
The user should be able to set this globally and then override for a given test (they should be able to partially override).
The user should also be able to ignore this entirely and just rely on the built-in defaults.
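Partial overriding could be as simple as merging a per-test dict over the global defaults (a sketch of the mechanics, with assumed keys and values):

global_formats = {
    "mismatched_rows": ["red", "bold"],
    "matched_rows": "blue",
    "mismatched_cells": ["white", "underline"],
}
test_overrides = {"matched_rows": "green"}
# Later keys win, so the per-test value replaces only what it names.
effective_formats = {**global_formats, **test_overrides}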
Hopefully we can make the outputs look good on both Mac and Windows machines.
Is there a way you could restructure this so that allow_nan_equality is checked on specific row comparisons? It would cut down on some code duplication.
Originally posted by @nchammas in #16 (comment)
Dear team,
I am trying to integrate this library into my build process for unit tests and got the below-mentioned error during the build. We are in a Python 2.7 environment. Can you please help here?
Traceback (most recent call last):
File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/nose/loader.py", line 418, in loadTestsFromName
addr.filename, addr.module)
File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/nose/importer.py", line 47, in importFromPath
return self.importFromDir(dir_path, fqname)
File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/nose/importer.py", line 94, in importFromDir
mod = load_module(part_fqname, fh, filename, desc)
File "/var/lib/jenkins/workspace/myproject/test/test_functions.py", line 3, in <module>
from chispa import *
File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/chispa/__init__.py", line 1, in <module>
from .dataframe_comparer import DataFramesNotEqualError, assert_df_equality, assert_approx_df_equality
File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/chispa/dataframe_comparer.py", line 2, in <module>
from chispa.bcolors import *
File "/var/lib/jenkins/workspace/myproject/build/venv/lib/python2.7/site-packages/chispa/bcolors.py", line 28
def blue(s: str) -> str:
SyntaxError: invalid syntax
Reconfigure the CI to run the test suite on pull requests.
This will make pull request review easier.