Kedro sample project #1

Open · wants to merge 8 commits into base `main`
97 changes: 96 additions & 1 deletion .gitignore
@@ -1,11 +1,79 @@

##########################
# KEDRO PROJECT

# ignore all local configuration
conf/local/**
!conf/local/.gitkeep

# ignore potentially sensitive credentials files
conf/**/*credentials*

# ignore everything in the following folders
data/**

# except their sub-folders
!data/**/

# also keep all .gitkeep files
!.gitkeep

# also keep the example dataset
!data/01_raw/companies.csv
!data/01_raw/reviews.csv
!data/01_raw/shuttles.xlsx

# ignore kedro-viz metadata
.viz

# ignore file based logs
*.log

##########################
# Common files

# IntelliJ
.idea/
*.iml
out/
.idea_modules/

### macOS
*.DS_Store
.AppleDouble
.LSOverride
.Trashes

# Vim
*~
.*.swo
.*.swp

# emacs
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc

# JIRA plugin
atlassian-ide-plugin.xml

### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
@@ -21,6 +89,7 @@ sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
@@ -39,28 +108,40 @@ pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
.static_storage/
.media/

# Flask stuff:
instance/
.webassets-cache
@@ -72,12 +153,14 @@ instance/
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py
@@ -118,6 +201,13 @@ __pypackages__/
celerybeat-schedule
celerybeat.pid

# pyenv
.python-version

# SageMath parsed files
*.sage.py

@@ -137,6 +227,7 @@ venv.bak/
# Rope project settings
.ropeproject

# mkdocs documentation
/site

@@ -160,3 +251,7 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# mlflow local runs
mlruns/*

102 changes: 101 additions & 1 deletion README.md
@@ -1 +1,101 @@
# ts2g2-sample-project

[![Powered by Kedro](https://img.shields.io/badge/powered_by-kedro-ffc900?logo=kedro)](https://kedro.org)

## Overview

This is your new Kedro project with Kedro-Viz and PySpark setup, which was generated using `kedro 0.19.9`.

Take a look at the [Kedro documentation](https://docs.kedro.org) to get started.

## Rules and guidelines

To get the most out of the template:

* Don't remove any lines from the `.gitignore` file we provide
* Make sure your results can be reproduced by following a [data engineering convention](https://docs.kedro.org/en/stable/faq/faq.html#what-is-data-engineering-convention)
* Don't commit data to your repository
* Don't commit any credentials or your local configuration to your repository. Keep all your credentials and local configuration in `conf/local/`

## How to install dependencies

Declare any dependencies in `requirements.txt` for `pip` installation.

To install them, run:

```
pip install -r requirements.txt
```

## How to run your Kedro pipeline

You can run your Kedro project with:

```
kedro run
```

## How to test your Kedro project

Have a look at the files `src/tests/test_run.py` and `src/tests/pipelines/data_science/test_pipeline.py` for instructions on how to write your tests. Run the tests as follows:

```
pytest
```

To configure the coverage threshold, look at the `.coveragerc` file.
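
As a rough orientation, a smoke test in the spirit of the generated `src/tests/test_run.py` might look like this (a sketch, not the file's exact contents):

```
from pathlib import Path

from kedro.framework.session import KedroSession
from kedro.framework.startup import bootstrap_project


def test_project_runs_end_to_end():
    """Run the default pipeline and fail the test on any node error."""
    bootstrap_project(Path.cwd())
    with KedroSession.create(project_path=Path.cwd()) as session:
        session.run()
```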

## Project dependencies

To see and update the dependency requirements for your project use `requirements.txt`. Install the project requirements with `pip install -r requirements.txt`.

[Further information about project dependencies](https://docs.kedro.org/en/stable/kedro_project_setup/dependencies.html#project-specific-dependencies)

## How to work with Kedro and notebooks

> Note: Using `kedro jupyter` or `kedro ipython` to run your notebook provides these variables in scope: `catalog`, `context`, `pipelines` and `session`.
>
> Jupyter, JupyterLab, and IPython are already included in the project requirements by default, so once you have run `pip install -r requirements.txt` you will not need to take any extra steps before you use them.
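
For example, inside a `kedro jupyter` or `kedro ipython` session you can load any dataset registered in `conf/base/catalog.yml` directly (a sketch using the `amazon` entry defined later in this PR):

```
# `catalog` is injected into scope by kedro jupyter / kedro ipython
df = catalog.load("amazon")  # raw AMZN.csv as a pandas DataFrame
df.head()
```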

### Jupyter
To use Jupyter notebooks in your Kedro project, make sure Jupyter is installed:

```
pip install jupyter
```

After installing Jupyter, you can start a local notebook server:

```
kedro jupyter notebook
```

### JupyterLab
To use JupyterLab, make sure it is installed:

```
pip install jupyterlab
```

You can also start JupyterLab:

```
kedro jupyter lab
```

### IPython
If you want to run an IPython session:

```
kedro ipython
```

### How to ignore notebook output cells in `git`
To automatically strip out all output cell contents before committing to `git`, you can use tools like [`nbstripout`](https://github.com/kynan/nbstripout). For example, you can add a hook in `.git/config` with `nbstripout --install`. This will run `nbstripout` before anything is committed to `git`.

> *Note:* Your output cells will be retained locally.

## Package your Kedro project

[Further information about building project documentation and packaging your project](https://docs.kedro.org/en/stable/tutorial/package_a_project.html)
20 changes: 20 additions & 0 deletions conf/README.md
@@ -0,0 +1,20 @@
# What is this for?

This folder should be used to store configuration files used by Kedro or by separate tools.

This file can be used to provide users with instructions for how to reproduce local configuration with their own credentials. You can edit the file however you like, but you may wish to retain the information below and add your own project-specific instructions in a section titled **Instructions**.

## Local configuration

The `local` folder should be used for configuration that is either user-specific (e.g. IDE configuration) or protected (e.g. security keys).

> *Note:* Please do not check in any local configuration to version control.

## Base configuration

The `base` folder is for shared configuration, such as non-sensitive and project-related configuration that may be shared across team members.

WARNING: Please do not put access credentials in the base configuration folder.

## Find out more
You can find out more about configuration from the [user guide documentation](https://docs.kedro.org/en/stable/configuration/configuration_basics.html).
63 changes: 63 additions & 0 deletions conf/base/catalog.yml
@@ -0,0 +1,63 @@
amazon:
  type: pandas.CSVDataset
  filepath: data/01_raw/AMZN.csv

# Intermediate datasets: Adding UUID, creating vectors, and adding different graphs

# Data with UUID column added
amazon_data_with_uuid:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/amazon_data_with_uuid.csv

# Data with vectors created (intermediate transformation result)
amazon_data_with_vectors:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/amazon_data_with_vectors.csv

# Data with visibility graph
amazon_data_with_visibility_graph:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/amazon_data_with_visibility_graph.csv

# Data with ordinal partition graph
amazon_data_with_ordinal_partition_graph:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/amazon_data_with_ordinal_partition_graph.csv

# Data with quantile graph
amazon_data_with_quantile_graph:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/amazon_data_with_quantile_graph.csv

# Datasets after adding UUID and random walks columns for each graph type

# Data with UUID and random walks for visibility graph
amazon_data_with_rand_walks_visibility:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/amazon_data_with_rand_walks_visibility.csv

# Data with UUID and random walks for ordinal partition graph
amazon_data_with_rand_walks_ordinal_partition:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/amazon_data_with_rand_walks_ordinal_partition.csv

# Data with UUID and random walks for quantile graph
amazon_data_with_rand_walks_quantile:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/amazon_data_with_rand_walks_quantile.csv

# The trained model dataset (containing UUID, random walks, and model)
visibility_graph_embedding_model:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/visibility_graph_embedding_model.csv  # Final output file path

# The trained model dataset (containing UUID, random walks, and model)
ordinal_partition_graph_embedding_model:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/ordinal_partition_graph_embedding_model.csv  # Final output file path

# The trained model dataset (containing UUID, random walks, and model)
quantile_graph_embedding_model:
  type: pandas.CSVDataset  # Data stored as a pandas DataFrame
  filepath: data/02_intermediate/quantile_graph_embedding_model.csv  # Final output file path
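
For context, catalog entries are consumed by pipeline nodes by name. Below is a minimal sketch of how the first step might be wired up; `add_uuid_column` is a hypothetical node function for illustration, not necessarily what this project implements:

```
import uuid

import pandas as pd
from kedro.pipeline import Pipeline, node


def add_uuid_column(df: pd.DataFrame) -> pd.DataFrame:
    """Attach a unique identifier to every row of the raw data."""
    df = df.copy()
    df["uuid"] = [str(uuid.uuid4()) for _ in range(len(df))]
    return df


def create_pipeline(**kwargs) -> Pipeline:
    return Pipeline(
        [
            node(
                func=add_uuid_column,
                inputs="amazon",                  # raw AMZN.csv from the catalog
                outputs="amazon_data_with_uuid",  # intermediate CSV defined above
                name="add_uuid_node",
            ),
        ]
    )
```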
Empty file added conf/base/parameters.yml
Empty file.
8 changes: 8 additions & 0 deletions conf/base/spark.yml
@@ -0,0 +1,8 @@
# You can define Spark-specific configuration here.

spark.driver.maxResultSize: 3g
spark.hadoop.fs.s3a.impl: org.apache.hadoop.fs.s3a.S3AFileSystem
spark.sql.execution.arrow.pyspark.enabled: true

# https://docs.kedro.org/en/stable/integrations/pyspark_integration.html#tips-for-maximising-concurrency-using-threadrunner
spark.scheduler.mode: FAIR
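
These settings are not picked up automatically; the Kedro PySpark integration docs suggest loading them in a hook. A minimal sketch, assuming a `"spark": ["spark*"]` entry is registered in the project's config-loader patterns:

```
from kedro.framework.hooks import hook_impl
from pyspark import SparkConf
from pyspark.sql import SparkSession


class SparkHooks:
    @hook_impl
    def after_context_created(self, context) -> None:
        """Initialise a SparkSession from the settings in spark.yml."""
        parameters = context.config_loader["spark"]
        spark_conf = SparkConf().setAll(parameters.items())

        # Create (or reuse) a session; nodes can then call
        # SparkSession.getActiveSession() instead of building their own.
        (
            SparkSession.builder.appName("ts2g2-sample-project")
            .config(conf=spark_conf)
            .getOrCreate()
        )
```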
Empty file added conf/local/.gitkeep
Empty file.