ran ruff

rapidsai · Oct 11, 2024 · 9802cbf
1 parent 7c07bbe
commit 9802cbf
Show file tree

Hide file tree

Showing 32 changed files with 899 additions and 1,269 deletions.
diff --git a/extensions/rapids_notebook_files.py b/extensions/rapids_notebook_files.py
@@ -16,9 +16,7 @@ def walk_files(app, dir, outdir):
     related_notebook_files = {}
     for page in dir.glob("*"):
         if page.is_dir():
-            related_notebook_files[page.name] = walk_files(
-                app, page, outdir / page.name
-            )
+            related_notebook_files[page.name] = walk_files(app, page, outdir / page.name)
         else:
             with contextlib.suppress(OSError):
                 os.remove(str(outdir / page.name))
@@ -59,9 +57,7 @@ def find_notebook_related_files(app, pagename, templatename, context, doctree):
         path_to_output_parent = output_root / rel_page_parent
 
         # Copy all related files to output and apply templating
-        related_notebook_files = walk_files(
-            app, path_to_page_parent, path_to_output_parent
-        )
+        related_notebook_files = walk_files(app, path_to_page_parent, path_to_output_parent)
 
         # Make archive of related files
         if related_notebook_files and len(related_notebook_files) > 1:

diff --git a/extensions/rapids_related_examples.py b/extensions/rapids_related_examples.py
@@ -22,9 +22,7 @@ def read_notebook_tags(path: str) -> list[str]:
         return []
 
 
-def generate_notebook_grid_myst(
-    notebooks: list[str], env: BuildEnvironment
-) -> list[str]:
+def generate_notebook_grid_myst(notebooks: list[str], env: BuildEnvironment) -> list[str]:
     """Generate sphinx-design grid of notebooks in MyST markdown.
 
     Take a list of notebook documents and render out some MyST markdown displaying those
@@ -75,11 +73,7 @@ def get_title_for_notebook(path: str) -> str:
                 if i == len(cell_source) - 1:  # no next_token
                     continue
                 next_token = cell_source[i + 1]
-                if (
-                    token.type == "heading_open"
-                    and token.tag == "h1"
-                    and next_token.type == "inline"
-                ):
+                if token.type == "heading_open" and token.tag == "h1" and next_token.type == "inline":
                     return next_token.content
     raise ValueError("No top-level heading found")
 
@@ -146,9 +140,7 @@ def add_notebook_tag_map_to_context(app, pagename, templatename, context, doctre
         except KeyError:
             tag_tree[root] = [suffix]
     context["notebook_tag_tree"] = tag_tree
-    context["notebook_tags"] = [
-        tag for tag, pages in app.env.notebook_tag_map.items() if pagename in pages
-    ]
+    context["notebook_tags"] = [tag for tag, pages in app.env.notebook_tag_map.items() if pagename in pages]
 
 
 class NotebookGalleryTocTree(TocTree):
@@ -162,9 +154,7 @@ def run(self) -> list[nodes.Node]:
         output += toctree
 
         # Generate the card grid for all items in the toctree
-        notebooks = [
-            notebook for _, notebook in toctree[0].children[0].attributes["entries"]
-        ]
+        notebooks = [notebook for _, notebook in toctree[0].children[0].attributes["entries"]]
         grid_markdown = generate_notebook_grid_myst(notebooks=notebooks, env=self.env)
         for node in parse_markdown(markdown=grid_markdown, state=self.state):
             gallery += node

diff --git a/extensions/rapids_version_templating.py b/extensions/rapids_version_templating.py
@@ -49,9 +49,7 @@ def visit_reference(self, node: nodes.reference) -> None:
         uri_str = re.sub(r"~~~(.*?)~~~", r"{{ \1 }}", uri_str)
 
         # fill in appropriate values based on app context
-        node.attributes["refuri"] = re.sub(
-            r"(?<!\$)\{\{.*?\}\}", self.template_func, uri_str
-        )
+        node.attributes["refuri"] = re.sub(r"(?<!\$)\{\{.*?\}\}", self.template_func, uri_str)
 
         # update the document
         node.parent.replace(node, node)
@@ -61,19 +59,15 @@ def visit_Text(self, node: nodes.Text) -> None:
         Replace template strings in generic text.
         This roughly corresponds to HTML ``<p>``, ``<pre>``, and similar elements.
         """
-        new_node = nodes.Text(
-            re.sub(r"(?<!\$)\{\{.*?\}\}", self.template_func, node.astext())
-        )
+        new_node = nodes.Text(re.sub(r"(?<!\$)\{\{.*?\}\}", self.template_func, node.astext()))
         node.parent.replace(node, new_node)
 
     def template_func(self, match: re.Match) -> str:
         """
         Replace template strings like ``{{ rapids_version }}`` with real
         values like ``24.10``.
         """
-        return self.app.builder.templates.render_string(
-            source=match.group(), context=self.app.config.rapids_version
-        )
+        return self.app.builder.templates.render_string(source=match.group(), context=self.app.config.rapids_version)
 
 
 def version_template(

diff --git a/source/conf.py b/source/conf.py
@@ -43,18 +43,12 @@
     },
 }
 rapids_version = (
-    versions["stable"]
-    if os.environ.get("DEPLOYMENT_DOCS_BUILD_STABLE", "false") == "true"
-    else versions["nightly"]
+    versions["stable"] if os.environ.get("DEPLOYMENT_DOCS_BUILD_STABLE", "false") == "true" else versions["nightly"]
 )
 rapids_version["rapids_conda_channels_list"] = [
-    channel
-    for channel in rapids_version["rapids_conda_channels"].split(" ")
-    if channel != "-c"
+    channel for channel in rapids_version["rapids_conda_channels"].split(" ") if channel != "-c"
 ]
-rapids_version["rapids_conda_packages_list"] = rapids_version[
-    "rapids_conda_packages"
-].split(" ")
+rapids_version["rapids_conda_packages_list"] = rapids_version["rapids_conda_packages"].split(" ")
 
 # -- General configuration ---------------------------------------------------
 
@@ -94,9 +88,7 @@
 # -- Options for notebooks -------------------------------------------------
 
 nb_execution_mode = "off"
-rapids_deployment_notebooks_base_url = (
-    "https://github.com/rapidsai/deployment/blob/main/source/"
-)
+rapids_deployment_notebooks_base_url = "https://github.com/rapidsai/deployment/blob/main/source/"
 
 # -- Options for HTML output -------------------------------------------------
 
@@ -146,8 +138,6 @@
 def setup(app):
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
     app.add_css_file("css/custom.css")
-    app.add_js_file(
-        "https://docs.rapids.ai/assets/js/custom.js", loading_method="defer"
-    )
+    app.add_js_file("https://docs.rapids.ai/assets/js/custom.js", loading_method="defer")
     app.add_js_file("js/nav.js", loading_method="defer")
     app.add_js_file("js/notebook-gallery.js", loading_method="defer")
diff --git a/source/examples/rapids-1brc-single-node/notebook.ipynb b/source/examples/rapids-1brc-single-node/notebook.ipynb
@@ -200,9 +200,7 @@
    "source": [
     "n = 1_000_000_000  # Number of rows of data to generate\n",
     "\n",
-    "lookup_df = cudf.read_csv(\n",
-    "    \"lookup.csv\"\n",
-    ")  # Load our lookup table of stations and their mean temperatures\n",
+    "lookup_df = cudf.read_csv(\"lookup.csv\")  # Load our lookup table of stations and their mean temperatures\n",
     "std = 10.0  # We assume temperatures are normally distributed with a standard deviation of 10\n",
     "chunksize = 2e8  # Set the number of rows to generate in one go (reduce this if you run into GPU RAM limits)\n",
     "filename = Path(\"measurements.txt\")  # Choose where to write to\n",

diff --git a/source/examples/rapids-autoscaling-multi-tenant-kubernetes/notebook.ipynb b/source/examples/rapids-autoscaling-multi-tenant-kubernetes/notebook.ipynb
@@ -995,12 +995,8 @@
     "\n",
     "\n",
     "def map_haversine(part):\n",
-    "    pickup = cuspatial.GeoSeries.from_points_xy(\n",
-    "        part[[\"pickup_longitude\", \"pickup_latitude\"]].interleave_columns()\n",
-    "    )\n",
-    "    dropoff = cuspatial.GeoSeries.from_points_xy(\n",
-    "        part[[\"dropoff_longitude\", \"dropoff_latitude\"]].interleave_columns()\n",
-    "    )\n",
+    "    pickup = cuspatial.GeoSeries.from_points_xy(part[[\"pickup_longitude\", \"pickup_latitude\"]].interleave_columns())\n",
+    "    dropoff = cuspatial.GeoSeries.from_points_xy(part[[\"dropoff_longitude\", \"dropoff_latitude\"]].interleave_columns())\n",
     "    return cuspatial.haversine_distance(pickup, dropoff)\n",
     "\n",
     "\n",
@@ -1506,9 +1502,7 @@
     "from random import randrange\n",
     "\n",
     "\n",
-    "def generate_workload(\n",
-    "    stages=3, min_width=1, max_width=3, variation=1, input_workload=None\n",
-    "):\n",
+    "def generate_workload(stages=3, min_width=1, max_width=3, variation=1, input_workload=None):\n",
     "    graph = [input_workload] if input_workload is not None else [run_haversine()]\n",
     "    last_width = min_width\n",
     "    for _ in range(stages):\n",
@@ -1646,35 +1640,25 @@
    ],
    "source": [
     "%%time\n",
-    "start_time = (datetime.datetime.now() - datetime.timedelta(minutes=15)).strftime(\n",
-    "    \"%Y-%m-%dT%H:%M:%SZ\"\n",
-    ")\n",
+    "start_time = (datetime.datetime.now() - datetime.timedelta(minutes=15)).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n",
     "try:\n",
     "    # Start with a couple of concurrent workloads\n",
     "    workload = generate_workload(stages=10, max_width=2)\n",
     "    # Then increase demand as more users appear\n",
-    "    workload = generate_workload(\n",
-    "        stages=5, max_width=5, min_width=3, variation=5, input_workload=workload\n",
-    "    )\n",
+    "    workload = generate_workload(stages=5, max_width=5, min_width=3, variation=5, input_workload=workload)\n",
     "    # Now reduce the workload for a longer period of time, this could be over a lunchbreak or something\n",
     "    workload = generate_workload(stages=30, max_width=2, input_workload=workload)\n",
     "    # Everyone is back from lunch and it hitting the cluster hard\n",
-    "    workload = generate_workload(\n",
-    "        stages=10, max_width=10, min_width=3, variation=5, input_workload=workload\n",
-    "    )\n",
+    "    workload = generate_workload(stages=10, max_width=10, min_width=3, variation=5, input_workload=workload)\n",
     "    # The after lunch rush is easing\n",
-    "    workload = generate_workload(\n",
-    "        stages=5, max_width=5, min_width=3, variation=5, input_workload=workload\n",
-    "    )\n",
+    "    workload = generate_workload(stages=5, max_width=5, min_width=3, variation=5, input_workload=workload)\n",
     "    # As we get towards the end of the day demand slows off again\n",
     "    workload = generate_workload(stages=10, max_width=2, input_workload=workload)\n",
     "    workload.compute()\n",
     "finally:\n",
     "    client.close()\n",
     "    cluster.close()\n",
-    "    end_time = (datetime.datetime.now() + datetime.timedelta(minutes=15)).strftime(\n",
-    "        \"%Y-%m-%dT%H:%M:%SZ\"\n",
-    "    )"
+    "    end_time = (datetime.datetime.now() + datetime.timedelta(minutes=15)).strftime(\"%Y-%m-%dT%H:%M:%SZ\")"
    ]
   },
   {
@@ -2037,14 +2021,10 @@
     "    end_time,\n",
     "    \"1s\",\n",
     ")\n",
-    "running_pods = running_pods[\n",
-    "    running_pods.columns.drop(list(running_pods.filter(regex=\"prepull\")))\n",
-    "]\n",
+    "running_pods = running_pods[running_pods.columns.drop(list(running_pods.filter(regex=\"prepull\")))]\n",
     "nodes = p.query_range(\"count(kube_node_info)\", start_time, end_time, \"1s\")\n",
     "nodes.columns = [\"Available GPUs\"]\n",
-    "nodes[\"Available GPUs\"] = (\n",
-    "    nodes[\"Available GPUs\"] * 2\n",
-    ")  # We know our nodes each had 2 GPUs\n",
+    "nodes[\"Available GPUs\"] = nodes[\"Available GPUs\"] * 2  # We know our nodes each had 2 GPUs\n",
     "nodes[\"Utilized GPUs\"] = running_pods.sum(axis=1)"
    ]
   },

diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb
@@ -97,7 +97,6 @@
     "from azure.ai.ml import MLClient\n",
     "from azure.identity import DefaultAzureCredential\n",
     "\n",
-    "\n",
     "subscription_id = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n",
     "resource_group_name = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n",
     "workspace_name = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n",
@@ -219,9 +218,7 @@
     "    )\n",
     "    ml_client.compute.begin_create_or_update(gpu_target).result()\n",
     "\n",
-    "    print(\n",
-    "        f\"AMLCompute with name {gpu_target.name} is created, the compute size is {gpu_target.size}\"\n",
-    "    )"
+    "    print(f\"AMLCompute with name {gpu_target.name} is created, the compute size is {gpu_target.size}\")"
    ]
   },
   {
@@ -488,9 +485,7 @@
     "\n",
     "\n",
     "# Define the limits for this sweep\n",
-    "sweep_job.set_limits(\n",
-    "    max_total_trials=10, max_concurrent_trials=2, timeout=18000, trial_timeout=3600\n",
-    ")\n",
+    "sweep_job.set_limits(max_total_trials=10, max_concurrent_trials=2, timeout=18000, trial_timeout=3600)\n",
     "\n",
     "\n",
     "# Specify your experiment details\n",

diff --git a/source/examples/rapids-azureml-hpo/rapids_csp_azure.py b/source/examples/rapids-azureml-hpo/rapids_csp_azure.py
@@ -132,9 +132,7 @@ def load_hyperparams(self, model_name="XGBoost"):
             self.log_to_file(str(error))
             return
 
-    def load_data(
-        self, filename="dataset.orc", col_labels=None, y_label="ArrDelayBinary"
-    ):
+    def load_data(self, filename="dataset.orc", col_labels=None, y_label="ArrDelayBinary"):
         """
         Loading the data into the object from the filename and based on the columns that we are
         interested in. Also, generates y_label from 'ArrDelay' column to convert this into a binary
@@ -185,9 +183,7 @@ def load_data(
 
                     elif "multi" in self.compute_type:
                         self.log_to_file("\n\tReading using dask dataframe")
-                        dataset = dask.dataframe.read_parquet(
-                            target_filename, columns=col_labels
-                        )
+                        dataset = dask.dataframe.read_parquet(target_filename, columns=col_labels)
 
             elif "GPU" in self.compute_type:
                 # GPU Reading Option
@@ -205,9 +201,7 @@ def load_data(
 
                     elif "multi" in self.compute_type:
                         self.log_to_file("\n\tReading using dask_cudf")
-                        dataset = dask_cudf.read_parquet(
-                            target_filename, columns=col_labels
-                        )
+                        dataset = dask_cudf.read_parquet(target_filename, columns=col_labels)
 
         # cast all columns to float32
         for col in dataset.columns:
@@ -222,14 +216,10 @@ def load_data(
         dataset = dataset.fillna(0.0)  # Filling the null values. Needed for dask-cudf
 
         self.log_to_file(f"\n\tIngestion completed in {ingestion_timer.duration}")
-        self.log_to_file(
-            f"\n\tDataset descriptors: {dataset.shape}\n\t{dataset.dtypes}"
-        )
+        self.log_to_file(f"\n\tDataset descriptors: {dataset.shape}\n\t{dataset.dtypes}")
         return dataset, col_labels, y_label, ingestion_timer.duration
 
-    def split_data(
-        self, dataset, y_label, train_size=0.8, random_state=0, shuffle=True
-    ):
+    def split_data(self, dataset, y_label, train_size=0.8, random_state=0, shuffle=True):
         """
         Splitting data into train and test split, has appropriate imports for different compute modes.
         CPU compute - Uses sklearn, we manually filter y_label column in the split call
@@ -321,13 +311,9 @@ def train_model(self, X_train, y_train, model_params):
 
         try:
             if self.model_type == "XGBoost":
-                trained_model, training_time = self.fit_xgboost(
-                    X_train, y_train, model_params
-                )
+                trained_model, training_time = self.fit_xgboost(X_train, y_train, model_params)
             elif self.model_type == "RandomForest":
-                trained_model, training_time = self.fit_random_forest(
-                    X_train, y_train, model_params
-                )
+                trained_model, training_time = self.fit_random_forest(X_train, y_train, model_params)
         except Exception as error:
             self.log_to_file("\n\n!error during model training: " + str(error))
         self.log_to_file(f"\n\tFinished training in {training_time:.4f} s")
@@ -354,9 +340,7 @@ def fit_xgboost(self, X_train, y_train, model_params):
                 )
             elif "multi" in self.compute_type:
                 self.log_to_file("\n\tTraining multi-GPU XGBoost")
-                train_DMatrix = xgboost.dask.DaskDMatrix(
-                    self.client, data=X_train, label=y_train
-                )
+                train_DMatrix = xgboost.dask.DaskDMatrix(self.client, data=X_train, label=y_train)
                 trained_model = xgboost.dask.train(
                     self.client,
                     dtrain=train_DMatrix,
@@ -441,12 +425,8 @@ def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
             try:
                 if self.model_type == "XGBoost":
                     if "multi" in self.compute_type:
-                        test_DMatrix = xgboost.dask.DaskDMatrix(
-                            self.client, data=X_test, label=y_test
-                        )
-                        xgb_pred = xgboost.dask.predict(
-                            self.client, trained_model, test_DMatrix
-                        ).compute()
+                        test_DMatrix = xgboost.dask.DaskDMatrix(self.client, data=X_test, label=y_test)
+                        xgb_pred = xgboost.dask.predict(self.client, trained_model, test_DMatrix).compute()
                         xgb_pred = (xgb_pred > threshold) * 1.0
                         test_accuracy = accuracy_score(y_test.compute(), xgb_pred)
                     elif "single" in self.compute_type:
@@ -459,13 +439,9 @@ def evaluate_test_perf(self, trained_model, X_test, y_test, threshold=0.5):
                     if "multi" in self.compute_type:
                         cuml_pred = trained_model.predict(X_test).compute()
                         self.log_to_file("\n\tPrediction complete")
-                        test_accuracy = accuracy_score(
-                            y_test.compute(), cuml_pred, convert_dtype=True
-                        )
+                        test_accuracy = accuracy_score(y_test.compute(), cuml_pred, convert_dtype=True)
                     elif "single" in self.compute_type:
-                        test_accuracy = trained_model.score(
-                            X_test, y_test.astype("int32")
-                        )
+                        test_accuracy = trained_model.score(X_test, y_test.astype("int32"))
 
             except Exception as error:
                 self.log_to_file("\n\n!error during inference: " + str(error))