Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
317 commits
Select commit Hold shift + click to select a range
f069cce
lint
dlawin Feb 2, 2023
66d451c
databricks support 3 part id
dlawin Feb 2, 2023
0117d8b
Improved documentation
erezsh Feb 2, 2023
7e65475
Merge pull request #17 from datafold/dev
erezsh Feb 2, 2023
f415f5a
Merge pull request #13 from datafold/redshift_3_part_id
erezsh Feb 6, 2023
2fe17a1
Merge pull request #16 from datafold/databricks_3_part_id
erezsh Feb 6, 2023
c290356
[WIP] Compare jsons as dicts, add json type to tests
nicolasaldecoa Feb 7, 2023
64eb985
Merge branch 'datafold:master' into test-sqeleton-pr15
nicolasaldecoa Feb 7, 2023
bc0f757
add coltype aware matching and warning
nicolasaldecoa Feb 8, 2023
85ded0e
changes related to https://github.com/datafold/data-diff/pull/383
nicolasaldecoa Feb 8, 2023
53f804a
Support connections to DuckDB for `data-diff --dbt` (#1)
dbeatty10 Feb 11, 2023
ca3ec72
optimization hint support for oracle and mysql
RoderickJDunn Feb 13, 2023
5092994
unit tests
RoderickJDunn Feb 13, 2023
4f77c64
Fixed unit test
RoderickJDunn Feb 13, 2023
beba986
Fixed NotImplimentedError for other DBs
RoderickJDunn Feb 13, 2023
21f2537
Base optimizer_hints throws NotImplementedError
RoderickJDunn Feb 13, 2023
7457caa
Using mixin for optimizer_hints support
RoderickJDunn Feb 13, 2023
04ab11b
squash add pg 3 part id support
dlawin Feb 23, 2023
446c8e9
Use the same `select_table_schema` and `_normalize_table_path` as Red…
dbeatty10 Feb 24, 2023
d75cb4b
Use the same `select_table_schema` and `_normalize_table_path` as Red…
dbeatty10 Feb 24, 2023
62a2605
Small improvement to tests
erezsh Feb 24, 2023
6c94e17
Fix spelling error
erezsh Feb 24, 2023
201c4e9
Ran black
erezsh Feb 24, 2023
40159a3
Merge pull request #18 from datafold/postgres_3_part_id
erezsh Feb 24, 2023
b593284
Merge pull request #23 from datafold/RoderickJDunn-optimizer-hints-v1
erezsh Feb 24, 2023
5855132
Align with implementation for Postgres
dbeatty10 Feb 24, 2023
ec13b5f
Merge branch 'datafold:master' into master
dbeatty10 Feb 24, 2023
8f30705
Merge branch 'master' into dbeatty10/duckdb-information-schema
dbeatty10 Feb 24, 2023
9d99ceb
Test 3-part fully-qualified names (FQN) for DuckDB
dbeatty10 Feb 24, 2023
062d5cb
Bump to latest version of DuckDB
dbeatty10 Feb 24, 2023
f51e036
poetry lock --no-update
dlawin Feb 24, 2023
e5e51e3
Merge pull request #24 from dbeatty10/dbeatty10/duckdb-information-sc…
erezsh Feb 27, 2023
9e9e276
rebuild poetry.lock
erezsh Mar 2, 2023
2e6c735
Merge pull request #25 from datafold/new_lock
erezsh Mar 2, 2023
41ac680
Version bump (0.0.6)
erezsh Mar 2, 2023
1c736ad
postgres override select_table_unique_columns
dlawin Mar 2, 2023
793ab39
Merge pull request #26 from datafold/postgres_select_table_unique_col…
williebsweet Mar 2, 2023
3815892
Version bump (0.0.7)
erezsh Mar 3, 2023
e5b34f3
Convert timezone
bjoernhaeuser Feb 15, 2023
4c3bb93
Support querying view schema on Redshift
RoderickJDunn Mar 5, 2023
6a61c06
Support querying view schema on Redshift
RoderickJDunn Mar 5, 2023
8471d78
Added missing DATE type to Oracle
RoderickJDunn Mar 7, 2023
5a923f8
Fix merge issue
RoderickJDunn Mar 7, 2023
cda5180
Added infrastructure to support PR #20 (TimestampTZ repr)
erezsh Mar 10, 2023
44cf693
Fixes for redshift and presto
erezsh Mar 10, 2023
04018a1
Fix for presto
erezsh Mar 10, 2023
11e5dca
Merge pull request #30 from datafold/bjoernhaeuser-patch-1
erezsh Mar 10, 2023
ea8ab20
Add snowflake date mapping
bjoernhaeuser Mar 10, 2023
6c53273
Merge pull request #31 from bjoernhaeuser/snowflake-date
erezsh Mar 10, 2023
5db9130
adds a basic e2e test for --dbt using duckdb
dlawin Mar 15, 2023
c180cb1
Align PK support with Datafold SaaS (#446)
dlawin Mar 22, 2023
19657fa
Merge branch 'master' into issue_418_duckdb
dlawin Mar 23, 2023
6c81637
update test to reflect new output
dlawin Mar 23, 2023
7386009
add assert conditions
dlawin Mar 23, 2023
a417702
dbt-uuid (#455)
kylemcnair Mar 24, 2023
22f0a92
Merge branch 'master' into issue_418_duckdb
dlawin Mar 27, 2023
f406c0a
Merge pull request #443 from dlawin/issue_418_duckdb
dlawin Mar 27, 2023
946eb48
cloud api token flow
pik94 Mar 24, 2023
802d0d3
refactor with black
pik94 Mar 28, 2023
778626c
fix unit tests
pik94 Mar 28, 2023
e6d4231
reduce repetition in print statements
dlawin Mar 28, 2023
831c7c1
increment dbt supported version (#463)
dlawin Mar 28, 2023
e711b97
remove an extra TODO
pik94 Mar 30, 2023
b9cae0f
Merge pull request #462 from pik94/datafold-cloud-api-token-flow
pik94 Mar 30, 2023
4874f45
expand --cloud output by polling for results
dlawin Mar 31, 2023
97578f1
Optional
dlawin Mar 31, 2023
197f988
print the paths when there's an error
dlawin Mar 31, 2023
f7d5d8c
print url in error if we have diff_id
dlawin Mar 31, 2023
992651d
Merge pull request #467 from dlawin/issue_460
dlawin Mar 31, 2023
aa8c65a
add basic key/pair support
dlawin Mar 31, 2023
c4b4b68
snowflake: add support for key password
dlawin Mar 31, 2023
06c9676
snowflake add support for private_key_passphrase
dlawin Mar 31, 2023
dadda1b
Merge pull request #468 from dlawin/issue_442
dlawin Mar 31, 2023
8cf7da3
Merge branch 'master' into issue_442_2
dlawin Mar 31, 2023
8c5e98c
Merge pull request #34 from datafold/data_diff_issue_442
williebsweet Mar 31, 2023
10159ef
Various changes -
erezsh Apr 2, 2023
417d8f4
Merge pull request #35 from datafold/apr1
erezsh Apr 2, 2023
a5322ca
Version bump (0.0.8)
Apr 3, 2023
4d0f506
increase sqeleton to 0.0.8
dlawin Apr 3, 2023
9cb0e1d
Merge pull request #469 from dlawin/issue_442_2
dlawin Apr 3, 2023
71755f4
increment data-diff version
dlawin Apr 3, 2023
e09a45b
Merge pull request #472 from dlawin/data_diff_0_6_0
dlawin Apr 3, 2023
72738ee
continue on None or empty list
dlawin Apr 4, 2023
300b8c9
Merge branch 'master' into issue_476
dlawin Apr 4, 2023
c6cd6fe
raise when project vars not found
dlawin Apr 4, 2023
c185f50
Add support for smallint to Postgres
MattDelac Apr 4, 2023
bfef4d0
Merge pull request #478 from dlawin/issue_474
dlawin Apr 4, 2023
693f5b2
Merge pull request #37 from MattDelac/support_smallint_for_redshift
erezsh Apr 4, 2023
3075304
allow pass: key for redshift
dlawin Apr 5, 2023
6e87b89
add tests to reproduce the issue
vvkh Apr 5, 2023
d0bd81e
fix FLOAT type conversion in presto
vvkh Apr 5, 2023
c3688b2
Merge pull request #38 from datafold/fix-type-conversion-in-presto
vvkh Apr 6, 2023
7fa4180
dbt tracking
kylemcnair Apr 6, 2023
0946043
Merge pull request #488 from kylemcnair/dbt-tracking
dlawin Apr 6, 2023
92b1782
Merge branch 'master' into refactor_dbt_output_strings
dlawin Apr 6, 2023
c2b8e92
post merge changes
dlawin Apr 6, 2023
686ee7c
formatter
dlawin Apr 6, 2023
b8ae778
Merge pull request #464 from dlawin/refactor_dbt_output_strings
dlawin Apr 6, 2023
9cec3d4
Merge branch 'master' into issue_481
dlawin Apr 6, 2023
d6cf458
add a class to interact with the datafold cloud api
pik94 Mar 28, 2023
4b5ff5f
add a flow to get or create data source
pik94 Mar 28, 2023
bc3ddb9
run tests for a new created data sources
pik94 Mar 29, 2023
7212021
fix unit tests
pik94 Mar 29, 2023
fc718b1
add some unit tests for creating of data source configs and data sour…
pik94 Mar 29, 2023
641b1a5
return test results instead of a flag for _test_data_source function
pik94 Mar 29, 2023
520f5a4
add unit tests for data source tests
pik94 Mar 30, 2023
c219b1e
fix an incorrect float_tolerance value
pik94 Mar 30, 2023
3998f94
add a function to render available data sources
pik94 Mar 30, 2023
356ddaf
refactor
pik94 Mar 30, 2023
e0a7616
improve logging for cloud CLI
pik94 Mar 31, 2023
78324e6
disable profiling and lineage for cloud data sources
pik94 Mar 31, 2023
d9f85c4
fix unit tests
pik94 Mar 31, 2023
abc4b73
update a link
pik94 Mar 31, 2023
c5a6cd4
update after rebasing to the latest master
pik94 Apr 5, 2023
27b2472
update unit tests
pik94 Apr 5, 2023
4b551d3
reformat using black
pik94 Apr 5, 2023
64d62ae
remove unused function
pik94 Apr 5, 2023
4183459
fix typos and validate fields without default values
pik94 Apr 6, 2023
679014b
add a unit test for validating a value of a required field
pik94 Apr 6, 2023
f07d1c8
fix a typo
pik94 Apr 6, 2023
e3aa793
set required parameters for data sources in pydantic
pik94 Apr 6, 2023
fd94a7b
fix unit tests
pik94 Apr 6, 2023
03220e8
use monotonic time and improve error messages
pik94 Apr 6, 2023
67b8b70
replace internal endpoints to external
pik94 Apr 6, 2023
552b06b
apply black formatter
pik94 Apr 7, 2023
0e4d6d4
Merge pull request #477 from dlawin/issue_476
dlawin Apr 7, 2023
25bcb82
Issue 489: do not infer source pks from tests (#490)
dlawin Apr 7, 2023
9fd3ecc
add snowflake sso (#487)
dlawin Apr 7, 2023
1377eb4
use an external endpoint for getting a data source list
pik94 Apr 11, 2023
8d1fb40
Merge pull request #466 from pik94/datafold-database-credentials-from…
pik94 Apr 11, 2023
f0e2758
save and get the api key from the system keyring service
pik94 Apr 5, 2023
6596d27
update poetry toml and lock files
pik94 Apr 11, 2023
1b214d6
limit a version of dsnparse because the 0.2.0 version does not work i…
pik94 Apr 11, 2023
deb324b
use "data-diff" as a service name for creds storage instead of "system"
pik94 Apr 11, 2023
44c0928
Merge pull request #482 from dlawin/issue_481
dlawin Apr 11, 2023
fef3939
fix up links in readme
leoebfolsom Apr 11, 2023
73af968
update console messages
pik94 Apr 12, 2023
0314423
add softer version limits for the dsnparse package
pik94 Apr 12, 2023
acd1643
Merge pull request #492 from pik94/ilia-dx-565-store-the-api-key-in-t…
pik94 Apr 12, 2023
24abee4
provide an opportunity to parse db credentials from dbt profiles.yml
pik94 Apr 10, 2023
a1f7792
add unit tests for parsing database credentials from dbt profiles.yml
pik94 Apr 10, 2023
8b7280c
fix typos
pik94 Apr 10, 2023
df4a5e6
fix a typo in unit tests
pik94 Apr 11, 2023
dd2330f
add type hints
pik94 Apr 11, 2023
e391403
update after rebasing to the latest master
pik94 Apr 12, 2023
4cf56db
Merge pull request #491 from pik94/ilia-dx-564-grab-credentials-from-…
pik94 Apr 12, 2023
3d19330
Create usage_analytics.md
williebsweet Apr 12, 2023
692c990
remove wording that applies only to hashdiff
williebsweet Apr 12, 2023
729f676
update links
williebsweet Apr 12, 2023
eecf119
Create common_use_cases.md
williebsweet Apr 12, 2023
19fc965
Merge pull request #493 from leoebfolsom/fix-readme-link
leoebfolsom Apr 12, 2023
af5ffe2
support insecure_mode key for snowflake profile
dlawin Apr 12, 2023
8026d0b
cleaner rows added/removed
dlawin Apr 13, 2023
33d2c5d
fix test
dlawin Apr 13, 2023
f71c6c5
revert project_path change
dlawin Apr 13, 2023
68a9d72
Merge pull request #497 from dlawin/tabulate_for_rows_added_removed
dlawin Apr 13, 2023
38c213a
simplify
dlawin Apr 13, 2023
ed971de
Merge pull request #496 from dlawin/issue_494
dlawin Apr 13, 2023
25692cb
Tidying up duplication between /docs and docs.datafold.com (#495)
leoebfolsom Apr 13, 2023
bf3ba0e
handle all custom schemas scenarios
dlawin Apr 14, 2023
716d3a3
format
dlawin Apr 14, 2023
c8f0c5b
test new custom schemas logic
dlawin Apr 14, 2023
9d23259
add a warning when the deprecated variable exists
dlawin Apr 14, 2023
b018eb2
link to doc in prod_custom_schema error
dlawin Apr 14, 2023
5632150
Merge pull request #498 from dlawin/issue_447
dlawin Apr 14, 2023
204a863
increment version
dlawin Apr 14, 2023
b047c27
Merge pull request #500 from dlawin/0_7_0
dlawin Apr 14, 2023
fe243d1
issue 501: instantiate dbt variables
dlawin Apr 15, 2023
85b6441
Merge pull request #502 from dlawin/issue_501
dlawin Apr 15, 2023
7ccd09b
increment version
dlawin Apr 15, 2023
8777ca2
Merge pull request #504 from dlawin/0_7_1
dlawin Apr 15, 2023
ee40c13
issue 505: add client_session_keep_alive (#506)
dlawin Apr 17, 2023
7a5769d
increment version
dlawin Apr 18, 2023
2a85528
provide a default value for temporary schema
pik94 Apr 19, 2023
0b1b54e
add unit tests for a temp schema
pik94 Apr 19, 2023
6b14972
Add logs to indicate current artifact being parsed in debug mode
MalanB Apr 19, 2023
3c0ec0f
align dbt cred params with datafold cred params
pik94 Apr 19, 2023
3d9a079
update unit tests
pik94 Apr 19, 2023
8e073e8
apply black
pik94 Apr 19, 2023
71522ee
working code to allow --select
dave-connors-3 Apr 19, 2023
1eacf7f
allow -s shorthand
dave-connors-3 Apr 19, 2023
5890b57
add a message for unsupported bigquery dbt auth methods
pik94 Apr 20, 2023
efcb5b5
Embed sqeleton into data-diff for synchronous changes & releases (wit…
Apr 20, 2023
c87d93c
Shift sqeleton deep inside of data-diff
Apr 20, 2023
50bac04
Merge pull request #485 from datafold/embed-sqeleton
nolar Apr 20, 2023
5de4850
Prepare the repo for archival
Apr 20, 2023
02e2089
Merge pull request #28 from RoderickJDunn/oracle_add_date_type
Apr 20, 2023
78d07f1
Merge pull request #27 from RoderickJDunn/redshift-views
Apr 20, 2023
2284491
make cloud diffs more responsive
dlawin Apr 19, 2023
e699026
shorten max sleep interval, add debug log
dlawin Apr 19, 2023
4a20f0c
fix print call count assertions
dlawin Apr 19, 2023
bbb406f
Merge pull request #509 from pik94/make-temp-schema-optional
pik94 Apr 20, 2023
a1282b7
warn when using recent dbt-core version
dlawin Apr 20, 2023
46f3371
remove upper bound from exception
dlawin Apr 20, 2023
4c311cf
Merge pull request #517 from dlawin/issue_516
dlawin Apr 20, 2023
c3f61c3
remove duplicated consts and methods
dlawin Apr 20, 2023
a5cd84d
Merge pull request #508 from MalanB/log-parser-artifacts
dlawin Apr 20, 2023
ef351cc
issue_518 add meta filter support
dlawin Apr 21, 2023
e7f75a9
fix tests
dlawin Apr 21, 2023
fdf0071
Merge pull request #522 from dlawin/issue_518_1
dlawin Apr 21, 2023
dd3740f
v0.7.3
dlawin Apr 21, 2023
ca9cd05
Merge pull request #523 from dlawin/v0_7_3
dlawin Apr 21, 2023
2f3e83d
Merge pull request #521 from dlawin/remove_duplicated_dbt_code
dlawin Apr 21, 2023
ef2e919
add a dock link when vars do not exist
dlawin Apr 25, 2023
d3f8dc1
use a single error message
dlawin Apr 25, 2023
d3365ef
bump version to 0.7.4
dlawin Apr 25, 2023
cfd941f
Merge pull request #528 from dlawin/guidance_when_no_dbt_vars
dlawin Apr 25, 2023
5db828c
Merge branch 'master' into allow-dbt-selectors
dlawin Apr 28, 2023
33e52bd
finalize merge
dlawin Apr 28, 2023
4edbbd0
handle more exception scenarios, style, .lock
dlawin Apr 28, 2023
6704ac8
typo
dlawin Apr 28, 2023
6739cfe
fix merge master mistake
dlawin Apr 28, 2023
c88b297
add new get_models tests, rename previous
dlawin Apr 28, 2023
7d29542
handle unexpected edge case
dlawin Apr 29, 2023
3fb5af2
Merge branch 'master' into redshift-super-support
May 2, 2023
4fc9d36
Merge pull request #15 from nicolasaldecoa/redshift-super-support
May 2, 2023
53f01c7
Follow-up with remaining changes in Sqeleton-to-DataDiff embedding
May 2, 2023
c3d0a84
focus readme on the dbt use case
leoebfolsom May 4, 2023
7a01ce9
wordsmithing and centering the gif
leoebfolsom May 5, 2023
6b3d000
Merge pull request #551 from leoebfolsom/update-readme
kylemcnair May 5, 2023
cf9e454
center the header
leoebfolsom May 5, 2023
33bf6a9
Merge pull request #552 from leoebfolsom/readme-cosmetic
kylemcnair May 5, 2023
14193b9
Merge pull request #543 from datafold/followup-sqeleton
nolar May 5, 2023
39445bf
Merge branch 'master' into test-sqeleton-pr15
nolar May 5, 2023
746b72e
Merge pull request #383 from nicolasaldecoa/test-sqeleton-pr15
nolar May 5, 2023
e0aafc0
Skip JSON/JSONB cross-db tests when the db does not have this type
May 5, 2023
72d4091
Avoid indexing by columns that do not support indexing (JSON/JSONB)
May 5, 2023
6cdd0a6
Expect some downloaded rows for fuzzily diffed column types
May 5, 2023
fbdb101
Escape and serialise the Postgres JSON/JSONB values in tests
May 5, 2023
6cf709b
Merge pull request #553 from datafold/fix-pg-json-tests
nolar May 5, 2023
d8a072f
correct early return not including the set object
nicolasaldecoa May 5, 2023
d10bf39
improve version check
dlawin May 5, 2023
ce52e8b
Merge pull request #511 from dave-connors-3/allow-dbt-selectors
dlawin May 5, 2023
3c18507
Merge pull request #554 from nicolasaldecoa/hotfix/diffs-are-equiv-jsons
nolar May 5, 2023
f9bf8a7
Rename JSONType → Type for brevity
May 3, 2023
f7c1b9d
Eliminate unused specification of JSON column types
May 3, 2023
46c8867
Convert JSON column type to a dataclass as all other column types
May 3, 2023
d198647
Merge pull request #545 from datafold/simplify-json-columns
nolar May 5, 2023
7c8d058
Compare JSON, ARRAY, STRUCT types in BigQuery (simplistically)
Apr 28, 2023
6fd0fbd
instantiate MACRO_DEBUGGING global for dbt
dlawin May 5, 2023
40e24e0
Merge pull request #533 from datafold/compare-bigquery-arrays-and-str…
nolar May 5, 2023
c80edac
cleanup
dlawin May 5, 2023
eef79ab
format
dlawin May 5, 2023
c646505
if dbt ls throws an exception, raise that first
dlawin May 5, 2023
7f5d9e8
Merge pull request #555 from dlawin/macro_debugging_error
dlawin May 5, 2023
796c5aa
Merge pull request #556 from dlawin/reorder_select_error_handling
dlawin May 5, 2023
1b75c49
increment version
dlawin May 5, 2023
eee180d
Merge pull request #557 from dlawin/v0_7_5
dlawin May 5, 2023
6f633e2
add database-specific installation instructions to readme
leoebfolsom May 6, 2023
ea68fb5
Merge pull request #559 from leoebfolsom/update-readme-with-db-specif…
leoebfolsom May 6, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ docs/_build/
# PyBuilder
target/

# Exception for dbt tests
!tests/dbt_artifacts/target

# Jupyter Notebook
.ipynb_checkpoints

Expand Down
174 changes: 61 additions & 113 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,160 +2,108 @@
<img alt="Datafold" src="https://user-images.githubusercontent.com/1799931/196497110-d3de1113-a97f-4322-b531-026d859b867a.png" width="50%" />
</p>

# **data-diff**
<h1 align="center">
data-diff
</h1>

<h2 align="center">
Develop dbt models faster by testing as you code.
</h2>
<h4 align="center">
See how every change to dbt code affects the data produced in the modified model and downstream.
</h4>
<br>

## What is `data-diff`?
data-diff is a **free, open-source tool** that enables data professionals to detect differences in values between any two tables. It's fast, easy to use, and reliable. Even at massive scale.

## Documentation
data-diff is an open source package that you can use to see the impact of your dbt code changes on your dbt models as you code.

[**🗎 Documentation website**](https://docs.datafold.com/os_diff/about) - our detailed documentation has everything you need to start diffing.
<div align="center">

### Databases we support
![development_testing_gif](https://user-images.githubusercontent.com/1799931/236354286-d1d044cf-2168-4128-8a21-8c8ca7fd494c.gif)

- PostgreSQL >=10
- MySQL
- Snowflake
- BigQuery
- Redshift
- Oracle
- Presto
- Databricks
- Trino
- Clickhouse
- Vertica
- DuckDB >=0.6
- SQLite (coming soon)
</div>

For their corresponding connection strings, check out our [detailed table](https://docs.datafold.com/os_diff/databases_we_support).
<br>

#### Looking for a database not on the list?
If a database is not on the list, we'd still love to support it. [Please open an issue](https://github.com/datafold/data-diff/issues) to discuss it, or vote on existing requests to push them up our todo list.
## Getting Started

## Use cases
**Install `data-diff`**

### Diff Tables Between Databases
#### Quickly identify issues when moving data between databases

<p align="center">
<img alt="diff2" src="https://user-images.githubusercontent.com/1799931/196754998-a88c0a52-8751-443d-b052-26c03d99d9e5.png" />
</p>

### Diff Tables Within a Database
#### Improve code reviews by identifying data problems you don't have tests for
<p align="center">
<a href=https://www.loom.com/share/682e4b7d74e84eb4824b983311f0a3b2 target="_blank">
<img alt="Intro to Diff" src="https://user-images.githubusercontent.com/1799931/196576582-d3535395-12ef-40fd-bbbb-e205ccae1159.png" width="50%" height="50%" />
</a>
</p>

&nbsp;
&nbsp;

## Get started

### Installation

#### First, install `data-diff` using `pip`.
Install `data-diff` with the command that is specific to the database you use with dbt.

### Snowflake
```
pip install data-diff
pip install data-diff 'data-diff[snowflake,dbt]' -U
```

#### Then, install one or more driver(s) specific to the database(s) you want to connect to.

- `pip install 'data-diff[mysql]'`

- `pip install 'data-diff[postgresql]'`

- `pip install 'data-diff[snowflake]'`

- `pip install 'data-diff[presto]'`

- `pip install 'data-diff[oracle]'`

- `pip install 'data-diff[trino]'`

- `pip install 'data-diff[clickhouse]'`

- `pip install 'data-diff[vertica]'`

- For BigQuery, see: https://pypi.org/project/google-cloud-bigquery/

_Some drivers have dependencies that cannot be installed using `pip` and still need to be installed manually._

### Run your first diff

Once you've installed `data-diff`, you can run it from the command line.

### BigQuery
```
data-diff DB1_URI TABLE1_NAME DB2_URI TABLE2_NAME [OPTIONS]
pip install data-diff 'data-diff[dbt]' google-cloud-bigquery -U
```

Be sure to read [the docs](https://docs.datafold.com/os_diff/how_to_use/how_to_use_with_command_line) for detailed instructions on how to build one of these commands for your database setup.

#### Code Example: Diff Tables Between Databases
Here's an example command for your copy/pasting, taken from the screenshot above when we diffed data between Snowflake and Postgres.
### Redshift
```
pip install data-diff 'data-diff[redshift,dbt]' -U
```

### Postgres
```
data-diff \
postgresql://<username>:'<password>'@localhost:5432/<database> \
<table> \
"snowflake://<username>:<password>@<account>/<DATABASE>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<ROLE>" \
<TABLE> \
-k activity_id \
-c activity \
-w "event_timestamp < '2022-10-10'"
pip install data-diff 'data-diff[postgres,dbt]' -U
```

#### Code Example: Diff Tables Within a Database
### Databricks
```
pip install data-diff 'data-diff[databricks,dbt]' -U
```

Here's a code example from [the video](https://www.loom.com/share/682e4b7d74e84eb4824b983311f0a3b2), where we compare data between two Snowflake tables within one database.
### DuckDB
```
pip install data-diff 'data-diff[duckdb,dbt]' -U
```

**Update a few lines in your `dbt_project.yml`**.
```
data-diff \
"snowflake://<username>:<password>@<account>/<DATABASE>/<SCHEMA_1>?warehouse=<WAREHOUSE>&role=<ROLE>" <TABLE_1> \
<SCHEMA_2>.<TABLE_2> \
-k org_id \
-c created_at -c is_internal \
-w "org_id != 1 and org_id < 2000" \
-m test_results_%t \
--materialize-all-rows \
--table-write-limit 10000
#dbt_project.yml
vars:
data_diff:
prod_database: my_database
prod_schema: my_default_schema
```

In both code examples, I've used `<>` angle brackets to mark placeholders that **should be replaced with your own values** in the database connection strings. For the flags (`-k`, `-c`, etc.), I opted for "real" values (`org_id`, `is_internal`) to give you a more realistic view of what your command will look like.
**Run your first data diff!**

### We're here to help!
```
dbt run && data-diff --dbt
```

We know that in some cases, the data-diff command can become long and dense. And maybe you're new to the command line.
We recommend you get started by walking through [our simple setup instructions](https://docs.datafold.com/development_testing/open_source) which contain examples and details.

* We're here to help [on slack](https://locallyoptimistic.slack.com/archives/C03HUNGQV0S) if you have ANY questions as you use `data-diff` in your workflow.
* You can also post a question in [GitHub Discussions](https://github.com/datafold/data-diff/discussions).
Please reach out on the dbt Slack in [#tools-datafold](https://getdbt.slack.com/archives/C03D25A92UU) if you have any trouble whatsoever getting started!

<br><br>

To get a Slack invite - [click here](https://locallyoptimistic.com/community/)
### Diffing between databases

## How to Use
Check out our [documentation](https://github.com/datafold/data-diff/blob/master/docs/supported-databases.md) if you're looking to compare data across databases (for example, between Postgres and Snowflake).

* [How to use from the shell (or: command-line)](https://docs.datafold.com/os_diff/how_to_use/how_to_use_with_command_line)
* [How to use from Python](https://docs.datafold.com/os_diff/how_to_use/how_to_use_with_python)
* [How to use with TOML configuration file](https://docs.datafold.com/os_diff/how_to_use/how_to_use_with_toml)
* [Usage Analytics & Data Privacy](https://docs.datafold.com/os_diff/usage_analytics_data_privacy)
<br>

## How to Contribute
* Feel free to open an issue or contribute to the project by working on an existing issue.
* Please read the [contributing guidelines](https://github.com/datafold/data-diff/blob/master/CONTRIBUTING.md) to get started.
## Contributors

Big thanks to everyone who contributed so far:
We thank everyone who contributed so far!

<a href="https://github.com/datafold/data-diff/graphs/contributors">
<img src="https://contributors-img.web.app/image?repo=datafold/data-diff" />
</a>

## Technical Explanation
<br>

## Analytics

* [Usage Analytics & Data Privacy](https://github.com/datafold/data-diff/blob/master/docs/usage_analytics.md)

Check out this [technical explanation](https://docs.datafold.com/os_diff/technical_explanation) of how data-diff works.
<br>

## License

Expand Down
2 changes: 1 addition & 1 deletion data_diff/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Sequence, Tuple, Iterator, Optional, Union

from sqeleton.abcs import DbTime, DbPath
from data_diff.sqeleton.abcs import DbTime, DbPath

from .tracking import disable_tracking
from .databases import connect
Expand Down
13 changes: 11 additions & 2 deletions data_diff/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@
import rich
import click

from sqeleton.schema import create_schema
from sqeleton.queries.api import current_timestamp
from data_diff.sqeleton.schema import create_schema
from data_diff.sqeleton.queries.api import current_timestamp

from .dbt import dbt_diff
from .utils import eval_name_template, remove_password_from_url, safezip, match_like
Expand Down Expand Up @@ -228,6 +228,13 @@ def write_usage(self, prog: str, args: str = "", prefix: Optional[str] = None) -
metavar="PATH",
help="Which directory to look in for the dbt_project.yml file. Default is the current working directory and its parents.",
)
@click.option(
"--select",
"-s",
default=None,
metavar="PATH",
help="select dbt resources to compare using dbt selection syntax",
)
def main(conf, run, **kw):
if kw["table2"] is None and kw["database2"]:
# Use the "database table table" form
Expand Down Expand Up @@ -264,6 +271,7 @@ def main(conf, run, **kw):
profiles_dir_override=kw["dbt_profiles_dir"],
project_dir_override=kw["dbt_project_dir"],
is_cloud=kw["cloud"],
dbt_selection=kw["select"],
)
else:
return _data_diff(**kw)
Expand Down Expand Up @@ -306,6 +314,7 @@ def _data_diff(
cloud,
dbt_profiles_dir,
dbt_project_dir,
select,
threads1=None,
threads2=None,
__conf__=None,
Expand Down
2 changes: 2 additions & 0 deletions data_diff/cloud/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .datafold_api import DatafoldAPI, TCloudApiDataDiff
from .data_source import get_or_create_data_source
Loading