Build your own rich data quality pipelines directly on dataframes using Owl's DQ suite
Owl Predictive Data Science + Spark Scale
![clean](https://camo.githubusercontent.com/979f851a67733cc6554f841032f35316f4e7ffbd9d182b89710654819a13e619/68747470733a2f2f6f776c2d616e616c79746963732e636f6d2f696d672f6c616e64696e672f636c65616e2d737461636b65642e706e67)
// Deliberately messy sample records: mixed casing, blank and null names,
// non-numeric ages, malformed ssn values and several phone formats.
val dirtyRecords = Seq(
  ("Kirk",    "brian", "11",     "222-33-4444", "410-489-8095"),
  ("kirk",    "kirk",  "22",     "111-33-4444", "410-489-8095"),
  ("bryan",   "Kirk",  "11",     "555-33-4444", "443-885-8989"),
  ("Dan",     "dan",   "33",     "666-33-4444", "(212) 333-4444"),
  ("",        "brian", "11",     "222-33-4444", "(212) 333-4444"),
  ("kirk",    "kirk",  "21",     "111-33-4444", "(212) 333-4444"),
  ("brianne", "kirk",  "18",     "555-33-4444", "3015859966"),
  ("dan",     "dan",   "31",     "666-33-4444", "3015859966"),
  ("kirk",    "brian", "66",     "222-33-4444", "3015859966"),
  ("kirk",    null,    "age-27", "111-33-4444", "3015859966"),
  ("brianne", "Kirt",  "13",     "555-33-4444", "3015859966"),
  ("dan",     "DAN",   "35",     "666-33-4444", "3015859966"),
  ("kirk",    "brian", "16",     "222-33-4444", "410-489-8095"),
  ("kirk",    "kirk",  "27",     "111-33-4444", "410-489-8095"),
  ("brian",   "kirt",  "18",     "17",          "3015859966"),
  ("dan",     "dan",   "39",     "666-33-4444", "3015859966"),
  ("",        "brian", "12",     "222-33-4444", "3015859966"),
  ("kirk",    "kirk",  "22",     "111-33-4444", "3015859966"),
  ("",        "brian", "kirk",   "1",           "3015859966"),
  ("dan",     null,    "34",     "666-33-4444", "3015859966"),
  ("brian",   "brian", "10",     "222-33-4444", "3015859966")
)
// Every column, including age, is loaded as a string.
val df = dirtyRecords.toDF("first", "last", "age", "ssn", "phone")

// Clean this dataset in 1 line!
val clean = new Owl(df).clean
// Sample rows for a simple dataframe: (first name, last name, application timestamp, price)
val priceHistory = Seq(
  ("Kirk", "Haslbeck", "2018-02-24 8:30:45", 18),
  ("Kirk", "Haslbeck", "2018-02-23 8:30:45", 11),
  ("Kirk", "Haslbeck", "2018-02-22 8:30:45", 10),
  ("Kirk", "Haslbeck", "2018-02-21 8:30:45", 12),
  ("Kirk", "Haslbeck", "2018-02-20 8:30:45", 10),
  ("Cat",  "Stevens",  "2018-02-20 8:30:45", 44),
  ("Cat",  "Stevens",  "2018-02-20 8:30:45", 47),
  ("Cat",  "Stevens",  "2018-02-24 8:30:45", 144)
)
val df = priceHistory.toDF("fname", "lname", "app_date", "price")

// Owl Outlier Detection in 1 line
// NOTE(review): `props` is not defined in this snippet; presumably an Owl
// properties/config object built earlier. Confirm, or show how to create it.
val outlierDF = new Owl(df, props).outliersDF(
  dlKey = "fname",
  dlLb = 5,
  dateCol = "app_date",
  runDate = "2018-02-24"
)
| Key | Column | Value | Prediction | Confidence |
|------|--------|-------|------------|------------|
| Cat | price | 144 | 45.5 | 34 |
| Kirk | price | 18 | 10.5 | 77 |
// Categorical sample: a 9-value pattern repeated six times, followed by an
// irregular tail that contains the rare "SWAP" values and the one-off "FUTRS".
val assetClass = {
  val cycle = Seq(
    "STOCK", "OPTION", "FUTURE",
    "STOCK", "OPTION", "FUTURE",
    "STOCK", "STOCK", "FUTURE"
  )
  val tail = Seq(
    "SWAP", "OPTION", "STOCK", "SWAP", "SWAP", "FUTRS", "STOCK", "OPTION", "FUTURE",
    "OPTION", "STOCK"
  )
  Seq.fill(6)(cycle).flatten ++ tail
}
// Load the asset-class strings into a DataFrame whose single column is named "ASSET".
val df = assetClass.toDF("ASSET")
// Run Owl outlier detection, passing "ASSET" as the only argument.
val outliers = new Owl(df).outliersDF("ASSET")
| Key | Column | Value | Prediction | Confidence |
|-----|--------|-------|------------|------------|
|     | ASSET | FUTRS | FUTURES |     |
Rules and Repair Strategies
// Wrap the DataFrame in an Owl instance
val owl = new Owl(df)
// Declare the 2 rules as (name, expression) pairs and register them in order
Seq(
  "kirk fname rule"   -> "first = 'Kirk'",
  "age less than 105" -> "age < 105"
).foreach { case (ruleName, expression) => owl.addRule(ruleName, expression) }
// Execute the rules in the Pipeline
owl.rulesDF
![duplicates](https://camo.githubusercontent.com/ebc39a89e7af0b0eb3727e8b135dd0104817d94dcde5ff793f3d32e1486da206/68747470733a2f2f6f776c2d616e616c79746963732e636f6d2f696d672f647570652d696d672e706e67)
// Sample rows whose first/last names are spelling and casing variants of one another.
val l = Seq(
("Kirk", "Haslbeck", "2018-02-24 8:30:45", 18),
("Kirk", "Hasselbach", "2018-02-23 8:30:45", 11),
("Kirt", "Haslbeck", "2018-02-22 8:30:45", 10),
("Kirk", "hasselback", "2018-02-21 8:30:45", 12),
("kirk", "Haslbeck", "2018-02-20 8:30:45", 10)
)
val df = l.toDF("fname","lname", "app_date", "price")
// Note: despite its name, `owl` is bound to the result of `dupesDF`, not to the Owl instance.
val owl = new Owl(df).dupesDF
| distance | records |
|----------|---------|
| 0.16 | DUPE [score: 83 ][Kirk,Hasselbach,2018-02-23 8:30:45] [Kirt,Haslbeck,2018-02-22 8:30:45] |
| 0.083 | DUPE [score: 91 ][Kirk,Haslbeck,2018-02-24 8:30:45] [Kirt,Haslbeck,2018-02-22 8:30:45] |
| 0.083 | DUPE [score: 91 ][Kirk,Haslbeck,2018-02-24 8:30:45] [Kirk,Hasselbach,2018-02-23 8:30:45] |