44 | 44 | # MAGIC summary_stats_file: profile_summary_stats.yml
45 | 45 | # MAGIC warehouse_id: your-warehouse-id
46 | 46 | # MAGIC ```
| 47 | +# MAGIC
| 48 | +# MAGIC If you installed DQX using a custom installation path, you must update the `custom_install_path` variable below. A custom installation path is required when using a [group assigned cluster](https://docs.databricks.com/aws/en/compute/group-access)!
| 49 | +
| 50 | +# COMMAND ----------
| 51 | +
| 52 | +# Update the installation path if you installed DQX in a custom folder!
| 53 | +custom_install_path: str = ""
| 54 | +dbutils.widgets.text("dqx_custom_installation_path", custom_install_path, "DQX Custom Installation Path")
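(For context: a custom path would typically be a shared workspace folder that a group-assigned cluster can reach; the folder below is a hypothetical illustration, not part of this PR.)

```python
# Hypothetical example: point DQX at a shared installation folder
# instead of the per-user default /Workspace/Users/<user>/.dqx
custom_install_path: str = "/Workspace/Shared/dqx"
```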
47 | 55 |
48 | 56 | # COMMAND ----------
49 | 57 |

107 | 115 | import glob
108 | 116 | import os
109 | 117 |
110 | | -user_name = spark.sql("select current_user() as user").collect()[0]["user"]
111 | | -default_dqx_installation_path = f"/Workspace/Users/{user_name}/.dqx"
| 118 | +if custom_install_path:
| 119 | +    default_dqx_installation_path = custom_install_path
| 120 | +    print(f"Using custom installation path: {custom_install_path}")
| 121 | +else:
| 122 | +    user_name = spark.sql("select current_user() as user").collect()[0]["user"]
| 123 | +    default_dqx_installation_path = f"/Workspace/Users/{user_name}/.dqx"
| 124 | +    print(f"Using default user's home installation path: {default_dqx_installation_path}")
| 125 | +
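Since the wheel lookup below globs the resolved folder, a fail-fast existence check is a cheap guard; a minimal sketch (not part of this PR, assuming the driver's FUSE mount of /Workspace, which the glob below already relies on):

```python
import os

# /Workspace paths are FUSE-mounted on the driver, so a plain existence
# check catches a mistyped custom folder before the wheel glob silently
# matches nothing.
if not os.path.isdir(default_dqx_installation_path):
    raise FileNotFoundError(f"DQX installation folder not found: {default_dqx_installation_path}")
```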
112 | 126 | default_dqx_product_name = "dqx"
113 | 127 |
114 | 128 | dbutils.widgets.text("dqx_installation_path", default_dqx_installation_path, "DQX Installation Folder")
115 | 129 | dbutils.widgets.text("dqx_product_name", default_dqx_product_name, "DQX Product Name")
116 | 130 |
117 | 131 | dqx_wheel_files_path = f"{dbutils.widgets.get('dqx_installation_path')}/wheels/databricks_labs_dqx-*.whl"
118 | 132 | dqx_wheel_files = glob.glob(dqx_wheel_files_path)
| 133 | +
119 | 134 | try:
120 | 135 |     dqx_latest_wheel = max(dqx_wheel_files, key=os.path.getctime)
121 | 136 | except:

126 | 141 |
127 | 142 | # COMMAND ----------
128 | 143 |
| 144 | +custom_install_path = dbutils.widgets.get('dqx_custom_installation_path') or None  # a value set in the widget overrides the empty default
| 145 | +
| 146 | +# COMMAND ----------
| 147 | +
129 | 148 | # MAGIC %md
130 | 149 | # MAGIC ### Run profiler workflow to generate quality rule candidates
131 | 150 | # MAGIC

162 | 181 | dq_engine = DQEngine(ws)
163 | 182 |
164 | 183 | # load the run configuration
165 | | -run_config = RunConfigLoader(ws).load_run_config(run_config_name="default", product_name=dqx_product_name)
| 184 | +run_config = RunConfigLoader(ws).load_run_config(
| 185 | +    run_config_name="default", product_name=dqx_product_name, install_folder=custom_install_path
| 186 | +)
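Review note: when the widget is left empty, `custom_install_path` is `None`, so this call should reduce to the previous behavior, assuming `install_folder` defaults to `None` in `RunConfigLoader.load_run_config`:

```python
# With no custom path set, the new call is equivalent to the old one
# (assumption: install_folder=None triggers the default folder discovery).
run_config = RunConfigLoader(ws).load_run_config(run_config_name="default", product_name=dqx_product_name)
```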
166 | 187 |
167 | 188 | # read the input data, limit to 1000 rows for demo purpose
168 | 189 | input_df = read_input_data(spark, run_config.input_config).limit(1000)

180 | 201 | print(yaml.safe_dump(checks))
181 | 202 |
182 | 203 | # save generated checks to location specified in the default run configuration inside workspace installation folder
183 | | -dq_engine.save_checks(checks, config=InstallationChecksStorageConfig(run_config_name="default", product_name=dqx_product_name))
| 204 | +dq_engine.save_checks(checks, config=InstallationChecksStorageConfig(
| 205 | +        run_config_name="default", product_name=dqx_product_name, install_folder=custom_install_path
| 206 | +    )
| 207 | +)
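For readers new to DQX, the `checks` being saved here are plain dicts in the DQX checks format; the sample below is illustrative only (profiler output depends on the data, and argument names can vary between DQX versions):

```python
# Illustrative shape of generated checks, not actual profiler output.
example_checks = [
    {
        "criticality": "error",
        "check": {"function": "is_not_null", "arguments": {"column": "vendor_id"}},
    },
]
```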
184 | 208 |
185 | 209 | # or save checks in arbitrary workspace location
186 | 210 | #dq_engine.save_checks(checks, config=WorkspaceFileChecksStorageConfig(location="/Shared/App1/checks.yml"))

245 | 269 | dq_engine = DQEngine(WorkspaceClient())
246 | 270 |
247 | 271 | # save checks to location specified in the default run configuration inside workspace installation folder
248 | | -dq_engine.save_checks(checks, config=InstallationChecksStorageConfig(run_config_name="default", product_name=dqx_product_name))
| 272 | +dq_engine.save_checks(checks, config=InstallationChecksStorageConfig(
| 273 | +        run_config_name="default", product_name=dqx_product_name, install_folder=custom_install_path
| 274 | +    )
| 275 | +)
249 | 276 |
250 | 277 | # or save checks in arbitrary workspace location
251 | 278 | #dq_engine.save_checks(checks, config=WorkspaceFileChecksStorageConfig(location="/Shared/App1/checks.yml"))

267 | 294 | dq_engine = DQEngine(WorkspaceClient())
268 | 295 |
269 | 296 | # load the run configuration
270 | | -run_config = RunConfigLoader(ws).load_run_config(run_config_name="default", assume_user=True, product_name=dqx_product_name)
| 297 | +run_config = RunConfigLoader(ws).load_run_config(
| 298 | +    run_config_name="default", assume_user=True, product_name=dqx_product_name, install_folder=custom_install_path
| 299 | +)
271 | 300 |
272 | 301 | # read the data, limit to 1000 rows for demo purpose
273 | 302 | bronze_df = read_input_data(spark, run_config.input_config).limit(1000)

276 | 305 | bronze_transformed_df = bronze_df.filter("vendor_id in (1, 2)")
277 | 306 |
278 | 307 | # load checks from location defined in the run configuration
279 | | -
280 | | -checks = dq_engine.load_checks(config=InstallationChecksStorageConfig(assume_user=True, run_config_name="default", product_name=dqx_product_name))
| 308 | +checks = dq_engine.load_checks(config=InstallationChecksStorageConfig(
| 309 | +        assume_user=True, run_config_name="default", product_name=dqx_product_name, install_folder=custom_install_path
| 310 | +    )
| 311 | +)
281 | 312 |
282 | 313 | # or load checks from arbitrary workspace file
283 | 314 | #checks = dq_engine.load_checks(config=WorkspaceFileChecksStorageConfig(location="/Shared/App1/checks.yml"))
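Further down (outside this hunk) the demo presumably applies the loaded checks; for reference, a minimal sketch using DQX's documented metadata API:

```python
# Split the input into records that pass all checks and records to quarantine.
valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(bronze_transformed_df, checks)
```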