Skip to content

Commit

Permalink
Updated the Categorical range constraint suggestions to use a new class called ConstraintSuggestionWithValue (#492)
Browse files Browse the repository at this point in the history

- The new class contains an additional field, which can be used to store a value associated with the constraint.
- This is useful for the categorical range rule, to store the categories. This way, the values can be extracted without needing to parse the constraint code which is stored as a string.
  • Loading branch information
rdsharma26 committed Apr 16, 2024
1 parent 1b80628 commit d624328
Show file tree
Hide file tree
Showing 16 changed files with 81 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,33 @@ import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.rules.ConstraintRule
import com.google.gson.{GsonBuilder, JsonArray, JsonObject}

case class ConstraintSuggestion(
/**
* Common shape of every constraint suggestion produced by a suggestion rule.
*
* Sealed, so all implementations live in this file.
*/
sealed trait ConstraintSuggestion {
// The suggested constraint itself
val constraint: Constraint
// Name of the column the suggestion applies to (rules pass `profile.column`)
val columnName: String
// Human-readable summary of the currently observed metric, e.g. "Compliance: 1"
val currentValue: String
// Human-readable description of the suggested constraint
val description: String
// The rule that produced this suggestion (rules pass `this`)
val suggestingRule: ConstraintRule[ColumnProfile]
// Source-code snippet for the constraint, stored as a string, e.g. `.isContainedIn(...)`
val codeForConstraint: String
}

case class CommonConstraintSuggestion(
constraint: Constraint,
columnName: String,
currentValue: String,
description: String,
suggestingRule: ConstraintRule[ColumnProfile],
codeForConstraint: String
)
) extends ConstraintSuggestion

/**
* A constraint suggestion that additionally carries a value associated with the constraint.
*
* The categorical range rules use it to store the suggested categories (as a `Seq[String]`),
* so callers can read them directly instead of parsing `codeForConstraint`.
*
* NOTE(review): `T` is erased at runtime, so a pattern such as
* `case s: ConstraintSuggestionWithValue[Seq[String]]` is unchecked; inspect `value` instead.
*
* @tparam T type of the value stored alongside the constraint
*/
case class ConstraintSuggestionWithValue[T](
constraint: Constraint,
columnName: String,
currentValue: String,
description: String,
suggestingRule: ConstraintRule[ColumnProfile],
codeForConstraint: String,
// Value associated with the constraint, e.g. the categories of a categorical range
value: T
) extends ConstraintSuggestion

object ConstraintSuggestions {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.analyzers.{DataTypeInstances, Histogram}
import com.amazon.deequ.analyzers.DataTypeInstances
import com.amazon.deequ.analyzers.Histogram
import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.complianceConstraint
import com.amazon.deequ.metrics.DistributionValue
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestionWithValue
import org.apache.commons.lang3.StringEscapeUtils
import com.amazon.deequ.metrics.DistributionValue

/** If we see a categorical range for a column, we suggest an IS IN (...) constraint */
case class CategoricalRangeRule(
Expand Down Expand Up @@ -53,15 +55,15 @@ case class CategoricalRangeRule(
override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
val valuesByPopularityNotNull = profile.histogram.get.values.toArray
.filterNot { case (key, _) => key == Histogram.NullFieldReplacement }
val valuesByPopularity = categorySorter(valuesByPopularityNotNull)
val valuesByPopularity = categorySorter(valuesByPopularityNotNull).map { case (key, _) => key }

val categoriesSql = valuesByPopularity
// the character "'" can be contained in category names
.map { case (key, _) => key.replace("'", "''") }
.map { _.replace("'", "''") }
.mkString("'", "', '", "'")

val categoriesCode = valuesByPopularity
.map { case (key, _) => StringEscapeUtils.escapeJava(key) }
.map { StringEscapeUtils.escapeJava }
.mkString(""""""", """", """", """"""")

val description = s"'${profile.column}' has value range $categoriesSql"
Expand All @@ -71,13 +73,14 @@ case class CategoricalRangeRule(
Check.IsOne,
columns = List(profile.column))

ConstraintSuggestion(
ConstraintSuggestionWithValue[Seq[String]](
constraint,
profile.column,
"Compliance: 1",
description,
this,
s""".isContainedIn("${profile.column}", Array($categoriesCode))"""
s""".isContainedIn("${profile.column}", Array($categoriesCode))""",
valuesByPopularity.toSeq
)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package com.amazon.deequ.suggestions.rules
import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.completenessConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If a column is complete in the sample, we suggest a NOT NULL constraint */
Expand All @@ -32,7 +33,7 @@ case class CompleteIfCompleteRule() extends ConstraintRule[ColumnProfile] {

val constraint = completenessConstraint(profile.column, Check.IsOne)

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
"Completeness: " + profile.completeness.toString,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions._
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** Abstract base class for all constraint suggestion rules */
abstract class ConstraintRule[P <: ColumnProfile] {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.analyzers.{DataTypeInstances, Histogram}
import com.amazon.deequ.analyzers.DataTypeInstances
import com.amazon.deequ.analyzers.Histogram
import com.amazon.deequ.constraints.Constraint.complianceConstraint
import com.amazon.deequ.metrics.DistributionValue
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestionWithValue
import org.apache.commons.lang3.StringEscapeUtils

import scala.math.BigDecimal.RoundingMode

/** If we see a categorical range for most values in a column, we suggest an IS IN (...)
Expand Down Expand Up @@ -63,15 +66,15 @@ case class FractionalCategoricalRangeRule(

val valuesByPopularityNotNull = topCategories.toArray
.filterNot { case (key, _) => key == Histogram.NullFieldReplacement }
val valuesByPopularity = categorySorter(valuesByPopularityNotNull)
val valuesByPopularity = categorySorter(valuesByPopularityNotNull).map { case (key, _) => key }

val categoriesSql = valuesByPopularity
// the character "'" can be contained in category names
.map { case (key, _) => key.replace("'", "''") }
.map { _.replace("'", "''") }
.mkString("'", "', '", "'")

val categoriesCode = valuesByPopularity
.map { case (key, _) => StringEscapeUtils.escapeJava(key) }
.map { StringEscapeUtils.escapeJava }
.mkString(""""""", """", """", """"""")

val p = ratioSums
Expand All @@ -89,14 +92,15 @@ case class FractionalCategoricalRangeRule(
val constraint = complianceConstraint(description, columnCondition, _ >= targetCompliance,
hint = Some(hint), columns = List(profile.column))

ConstraintSuggestion(
ConstraintSuggestionWithValue[Seq[String]](
constraint,
profile.column,
"Compliance: " + ratioSums.toString,
description,
this,
s""".isContainedIn("${profile.column}", Array($categoriesCode),
| _ >= $targetCompliance, Some("$hint"))""".stripMargin.replaceAll("\n", "")
| _ >= $targetCompliance, Some("$hint"))""".stripMargin.replaceAll("\n", ""),
valuesByPopularity.toSeq
)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.maxConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.checks

/** If we see a maximum value for a column, we suggest a corresponding
* constraint
Expand All @@ -41,7 +40,7 @@ case class HasMax() extends ConstraintRule[ColumnProfile] {
val description = s"'${profile.column}' <= $maximum"
val constraint = maxConstraint(profile.column, _ == maximum)

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
s"Maximum: $maximum",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.maxLengthConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.StringColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

case class HasMaxLength() extends ConstraintRule[ColumnProfile] {
Expand All @@ -35,7 +35,7 @@ case class HasMaxLength() extends ConstraintRule[ColumnProfile] {

val constraint = maxLengthConstraint(profile.column, _ <= maxLength)

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
"MaxLength: " + profile.completeness.toString,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.meanConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.checks

/** If we see a mean value for a column, we suggest a corresponding
* constraint
Expand All @@ -41,7 +40,7 @@ case class HasMean() extends ConstraintRule[ColumnProfile] {
val description = s"'${profile.column}' <= $mean"
val constraint = meanConstraint(profile.column, _ == mean)

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
s"Mean: $mean",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.minConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If we see a minimum value for a column, we suggest a corresponding
Expand All @@ -40,7 +40,7 @@ case class HasMin() extends ConstraintRule[ColumnProfile] {
val description = s"'${profile.column}' >= $minimum"
val constraint = minConstraint(profile.column, _ == minimum)

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
s"Minimum: $minimum",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.minLengthConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.StringColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

case class HasMinLength() extends ConstraintRule[ColumnProfile] {
Expand All @@ -36,7 +36,7 @@ case class HasMinLength() extends ConstraintRule[ColumnProfile] {

val constraint = minLengthConstraint(profile.column, _ >= minLength)

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
"MinLength: " + minLength,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@

package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.standardDeviationConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion
import com.amazon.deequ.checks

/** If we see a standard deviation for a column, we suggest a corresponding
* constraint
Expand All @@ -41,7 +40,7 @@ case class HasStandardDeviation() extends ConstraintRule[ColumnProfile] {
val description = s"'${profile.column}' <= $stdDev"
val constraint = standardDeviationConstraint(profile.column, _ == stdDev)

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
s"stdDev: $stdDev",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.complianceConstraint
import com.amazon.deequ.profiles.{ColumnProfile, NumericColumnProfile}
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.profiles.NumericColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If we see only non-negative numbers in a column, we suggest a corresponding constraint */
Expand All @@ -45,7 +47,7 @@ case class NonNegativeNumbersRule() extends ConstraintRule[ColumnProfile] {
case _ => "Error while calculating minimum!"
}

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
"Minimum: " + minimum,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,10 @@
package com.amazon.deequ.suggestions.rules

import com.amazon.deequ.constraints.Constraint.completenessConstraint
import com.amazon.deequ.profiles._
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

import scala.math.BigDecimal.RoundingMode

/**
Expand Down Expand Up @@ -47,7 +49,7 @@ case class RetainCompletenessRule() extends ConstraintRule[ColumnProfile] {

val description = s"'${profile.column}' has less than $boundInPercent% missing values"

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
"Completeness: " + profile.completeness.toString,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.ConstrainableDataTypes
import com.amazon.deequ.constraints.Constraint.dataTypeConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/** If we detect a non-string type, we suggest a type constraint */
Expand All @@ -46,7 +47,7 @@ case class RetainTypeRule() extends ConstraintRule[ColumnProfile] {

val constraint = dataTypeConstraint(profile.column, typeToCheck, Check.IsOne)

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
"DataType: " + profile.dataType.toString,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package com.amazon.deequ.suggestions.rules
import com.amazon.deequ.checks.Check
import com.amazon.deequ.constraints.Constraint.uniquenessConstraint
import com.amazon.deequ.profiles.ColumnProfile
import com.amazon.deequ.suggestions.CommonConstraintSuggestion
import com.amazon.deequ.suggestions.ConstraintSuggestion

/**
Expand All @@ -40,7 +41,7 @@ case class UniqueIfApproximatelyUniqueRule() extends ConstraintRule[ColumnProfil
val constraint = uniquenessConstraint(Seq(profile.column), Check.IsOne)
val approximateDistinctness = profile.approximateNumDistinctValues.toDouble / numRecords

ConstraintSuggestion(
CommonConstraintSuggestion(
constraint,
profile.column,
"ApproxDistinctness: " + approximateDistinctness.toString,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,19 @@ class ConstraintSuggestionsIntegrationTest extends WordSpec with SparkContextSpe
.instance.startsWith(s"'marketplace' has value range")
}

// Categorical range for "marketplace" with values
assert(
  constraintSuggestionResult.constraintSuggestions
    .getOrElse("marketplace", Seq.empty)
    .exists {
      // The type argument of ConstraintSuggestionWithValue is erased at runtime, so a
      // pattern on ConstraintSuggestionWithValue[Seq[String]] would match any payload.
      // Match the wildcard type and inspect the carried value explicitly instead.
      case suggestion: ConstraintSuggestionWithValue[_] =>
        suggestion.value match {
          // Order of the suggested categories is not part of the contract: compare sorted
          case suggestedCategories: Seq[_] =>
            suggestedCategories.map(_.toString).sorted == categories.toSeq.sorted
          case _ => false
        }
      case _ => false
    }
)

// IS NOT NULL for "measurement"
assertConstraintExistsIn(constraintSuggestionResult) { (analyzer, assertionFunc) =>
analyzer == Completeness("measurement") && assertionFunc(1.0)
Expand Down

0 comments on commit d624328

Please sign in to comment.