Skip to content

Commit

Permalink
Workshop changes (#88)
Browse files Browse the repository at this point in the history
* Update example_notebook.py

* Update example_notebook.py

* Update example_notebook.py
  • Loading branch information
ArthurKordes authored Feb 12, 2025
1 parent f88bbab commit 0c6b1f8
Showing 1 changed file with 69 additions and 4 deletions.
73 changes: 69 additions & 4 deletions scripts/example_notebook.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,93 @@
# Databricks notebook source
pip install dq-suite-amsterdam==0.11.10
# MAGIC %md
# MAGIC This notebook demonstrates the usage of dq-suite-amsterdam.
# MAGIC
# MAGIC **Requirements** to make this run:
# MAGIC - Create and start a personal compute
# MAGIC - Create the output tables using this script: https://github.com/Amsterdam/dq-suite-amsterdam/blob/main/scripts/data_quality_tables.sql
# MAGIC - Upload the data quality rules, which are defined in `dq_workshop.json`
# MAGIC
# MAGIC

# COMMAND ----------

# Install dq-suite
%pip install dq-suite-amsterdam==0.11.10


# COMMAND ----------

dbutils.library.restartPython()

# COMMAND ----------

# MAGIC %sql
# MAGIC -- explore the data
# MAGIC SELECT * FROM samples.nyctaxi.trips LIMIT 10
# MAGIC

# COMMAND ----------

df = spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 1000")
# MAGIC
# MAGIC %md
# MAGIC ## Run dq-suite validation
# MAGIC

# COMMAND ----------

# path to the JSON file containing the data quality rules
# -----replace with your own path
dq_rule_json_path = "/Workspace/Users/[email protected]/dq_workshop.json"


# COMMAND ----------

from dq_suite.validation import run_validation
# load data into a pyspark dataframe
df = spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 1000")
df.summary().show()


# COMMAND ----------

from dq_suite.validation import run_validation

# run the dataframe validation given the rules defined in dq_rule_json_path
run_validation(
json_path=dq_rule_json_path,
df=df,
spark_session=spark,
catalog_name="dpxx_dev",
catalog_name="dpxx_dev", # -----replace with your own catalog name
table_name="nyc_taxi",
validation_name="dq_workshop",
)


# COMMAND ----------

# MAGIC
# MAGIC %md
# MAGIC ## Explore output
# MAGIC

# COMMAND ----------

# MAGIC %sql
# MAGIC -- check the rules (`regel` table) from the JSON defined in dq_rule_json_path
# MAGIC SELECT * FROM dpxx_dev.data_quality.regel
# MAGIC

# COMMAND ----------

# MAGIC %sql
# MAGIC -- check the validation output (`validatie` table) of run_validation()
# MAGIC SELECT * FROM dpxx_dev.data_quality.validatie
# MAGIC

# COMMAND ----------

# MAGIC
# MAGIC %md
# MAGIC ## Exercise
# MAGIC - Check which rules in the `dq_workshop.json` file could be adjusted to make them work with the data
# MAGIC - Add an additional rule to be checked
# MAGIC - Check how the results are updated after multiple runs

0 comments on commit 0c6b1f8

Please sign in to comment.