Commit 0c6b1f8
* Update example_notebook.py
* Update example_notebook.py
* Update example_notebook.py
1 parent f88bbab
Showing 1 changed file with 69 additions and 4 deletions.
@@ -1,28 +1,93 @@
# Databricks notebook source
-pip install dq-suite-amsterdam==0.11.10
# MAGIC %md
# MAGIC This notebook demonstrates the usage of dq-suite-amsterdam.
# MAGIC
# MAGIC **Requirements** to make this run:
# MAGIC - Create and start a personal compute cluster
# MAGIC - Create the output tables using this script: https://github.com/Amsterdam/dq-suite-amsterdam/blob/main/scripts/data_quality_tables.sql
# MAGIC - Upload the data quality rules defined in `dq_workshop.json`
# MAGIC
# MAGIC
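The linked data_quality_tables.sql script is the authoritative way to create the output tables mentioned in the requirements above. As a minimal illustration of the first step it implies (not the contents of that script), creating the target schema in the catalog used later in this notebook might look like this:

```python
# Illustrative sketch only; the real DDL lives in scripts/data_quality_tables.sql.
# Assumes the dpxx_dev catalog exists and you have CREATE SCHEMA privileges on it.
spark.sql("CREATE SCHEMA IF NOT EXISTS dpxx_dev.data_quality")
```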
# COMMAND ----------

# Install dq-suite
%pip install dq-suite-amsterdam==0.11.10

# COMMAND ----------

dbutils.library.restartPython()

# COMMAND ----------

# MAGIC %sql
# MAGIC -- explore the data
# MAGIC SELECT * FROM samples.nyctaxi.trips LIMIT 10
# MAGIC

# COMMAND ----------

-df = spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 1000")
# MAGIC
# MAGIC %md
# MAGIC ## Run dq-suite validation
# MAGIC
# COMMAND ----------

# path to the JSON file with data quality rules
# -----replace with your own path
dq_rule_json_path = "/Workspace/Users/[email protected]/dq_workshop.json"
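Before running the validation, it can help to sanity check that this path points at valid JSON. A minimal sketch, assuming the /Workspace path is readable from the driver with regular Python file APIs (the case on recent Databricks runtimes):

```python
import json

# Load and pretty-print the rules file to verify the path and its contents.
with open(dq_rule_json_path) as f:
    dq_rules = json.load(f)

print(json.dumps(dq_rules, indent=2))
```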
# COMMAND ----------

-from dq_suite.validation import run_validation
# load data into a pyspark dataframe
df = spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 1000")
df.summary().show()

# COMMAND ----------

from dq_suite.validation import run_validation

# run the dataframe validation given the rules defined in dq_rule_json_path
run_validation(
    json_path=dq_rule_json_path,
    df=df,
    spark_session=spark,
-   catalog_name="dpxx_dev",
    catalog_name="dpxx_dev",  # -----replace with your own catalog name
    table_name="nyc_taxi",
    validation_name="dq_workshop",
)
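The validation output can also be pulled back into PySpark instead of, or in addition to, the SQL cells below. A minimal sketch, reusing the catalog, schema, and table names queried later in this notebook; the exact column layout of these tables is not shown here, so this only loads, counts, and displays them:

```python
# Read the dq-suite output tables back as DataFrames.
# Replace dpxx_dev with your own catalog name, as above.
regel_df = spark.table("dpxx_dev.data_quality.regel")
validatie_df = spark.table("dpxx_dev.data_quality.validatie")

print(f"rules: {regel_df.count()}, validation results: {validatie_df.count()}")
validatie_df.show(truncate=False)
```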
# COMMAND ----------

# MAGIC
# MAGIC %md
# MAGIC ## Explore output
# MAGIC

# COMMAND ----------

# MAGIC %sql
# MAGIC -- check the rules loaded from the JSON defined in dq_rule_json_path
# MAGIC SELECT * FROM dpxx_dev.data_quality.regel
# MAGIC

# COMMAND ----------

# MAGIC %sql
# MAGIC -- check the validation output from run_validation()
# MAGIC SELECT * FROM dpxx_dev.data_quality.validatie
# MAGIC
# COMMAND ----------

# MAGIC
# MAGIC %md
# MAGIC ## Exercise
# MAGIC - Check which rules in the dq_workshop.json file could be adjusted to make them work with the data
# MAGIC - Add an additional rule to be checked
# MAGIC - Check how the results are updated after multiple runs
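For the second exercise item, a hypothetical example of what an additional rule entry could look like. The key names ("rule_name", "parameters") and the Great Expectations style rule name are assumptions for illustration only; match them to the structure of the dq_workshop.json you uploaded and to the dq-suite-amsterdam documentation:

```python
# Hypothetical extra rule for the exercise; align the keys with your dq_workshop.json.
# fare_amount is a column of samples.nyctaxi.trips.
extra_rule = {
    "rule_name": "ExpectColumnValuesToNotBeNull",
    "parameters": {"column": "fare_amount"},
}
print(extra_rule)
```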