Merge pull request #11 from ScaDS/codegen

more code generation exercises
ScaDS · Aug 26, 2024 · f456ccd · f456ccd
2 parents b33831a + b5e72bb
commit f456ccd
Show file tree

Hide file tree

Showing 6 changed files with 820 additions and 1 deletion.
diff --git a/docs/30_function_calling/35_langchain_bia_choosing_algorithms.ipynb b/docs/30_function_calling/35_langchain_bia_choosing_algorithms.ipynb
diff --git a/docs/30_function_calling/bia_utilities.py b/docs/30_function_calling/bia_utilities.py
@@ -0,0 +1,77 @@
+def voronoi_otsu_labeling(image, spot_sigma: float = 2, outline_sigma: float = 2):
+    """Voronoi-Otsu-Labeling is a segmentation algorithm for blob-like structures such as nuclei and
+    granules with high signal intensity on low-intensity background.
+
+    The two sigma parameters allow tuning the segmentation result. The first sigma controls how close detected cells
+    can be (spot_sigma) and the second controls how precisely segmented objects are outlined (outline_sigma). Under the
+    hood, this filter applies two Gaussian blurs, spot detection, Otsu-thresholding and Voronoi-labeling. The
+    thresholded binary image is flooded using the Voronoi approach starting from the found local maxima. Noise-removal
+    sigma for spot detection and thresholding can be configured separately.
+
+    This allows segmenting connected objects such as nuclei that are not too dense.
+    If the nuclei are too dense, consider using stardist [1] or cellpose [2].
+
+    Parameters
+    ----------
+    image : array-like
+        Input image; it is converted with ``np.asarray`` before processing.
+    spot_sigma : float, optional
+        Sigma of the Gaussian blur applied before local-maxima (spot) detection.
+    outline_sigma : float, optional
+        Sigma of the Gaussian blur applied before Otsu-thresholding.
+
+    Returns
+    -------
+    labels
+        The label image produced by the seeded watershed, which is run with the Otsu-thresholded binary image
+        as its mask.
+
+    See also
+    --------
+    .. [0] https://github.com/clEsperanto/pyclesperanto_prototype/blob/master/demo/segmentation/voronoi_otsu_labeling.ipynb
+    .. [1] https://www.napari-hub.org/plugins/stardist-napari
+    .. [2] https://www.napari-hub.org/plugins/cellpose-napari
+    """
+    # dependencies are imported inside the function, so they are only needed when it is called
+    import numpy as np
+    from skimage.filters import threshold_otsu as sk_threshold_otsu, gaussian
+    from skimage.segmentation import watershed
+    from skimage.measure import label
+    from skimage.morphology import local_maxima
+
+
+    image = np.asarray(image)
+
+    # blur and detect local maxima
+    blurred_spots = gaussian(image, spot_sigma)
+    spot_centroids = local_maxima(blurred_spots)
+
+    # blur and threshold
+    blurred_outline = gaussian(image, outline_sigma)
+    threshold = sk_threshold_otsu(blurred_outline)
+    binary_otsu = blurred_outline > threshold
+
+    # determine local maxima within the thresholded area
+    # (the product of the two masks is non-zero only where both are set)
+    remaining_spots = spot_centroids * binary_otsu
+
+    # start from remaining spots and flood binary image with labels
+    # (the remaining spots are labeled first and then used as watershed markers;
+    # the binary image serves both as the image to flood and as the mask)
+    labeled_spots = label(remaining_spots)
+    labels = watershed(binary_otsu, labeled_spots, mask=binary_otsu)
+
+    return labels
+
+
+def local_minima_seeded_watershed(image, spot_sigma: float = 10, outline_sigma: float = 0):
+    """
+    Segment cells in images with fluorescently marked membranes.
+
+    The two sigma parameters allow tuning the segmentation result. The first sigma controls how close detected cells
+    can be (spot_sigma) and the second controls how precise segmented objects are outlined (outline_sigma). Under the
+    hood, this filter applies two Gaussian blurs, local minima detection and a seeded watershed.
+
+    Parameters
+    ----------
+    image : array-like
+        Input image; it is converted with ``np.asarray`` before processing.
+    spot_sigma : float, optional
+        Sigma of the Gaussian blur applied before local-minima detection; the labeled minima are the watershed seeds.
+    outline_sigma : float, optional
+        Sigma of the Gaussian blur applied to the image that the watershed floods.
+
+    Returns
+    -------
+    The label image returned by the seeded watershed. No mask is passed to the watershed.
+
+    See also
+    --------
+    .. [1] https://scikit-image.org/docs/dev/auto_examples/segmentation/plot_watershed.html
+    """
+    # dependencies are imported inside the function, so they are only needed when it is called
+    import numpy as np
+    from skimage.filters import gaussian
+    from skimage.segmentation import watershed
+    from skimage.measure import label
+    from skimage.morphology import local_minima
+
+    image = np.asarray(image)
+
+    # blur for seed detection
+    spot_blurred = gaussian(image, sigma=spot_sigma)
+
+    # local minima of the blurred image are labeled and used as watershed seeds
+    spots = label(local_minima(spot_blurred))
+
+    # reuse the first blur when both sigmas are equal instead of computing it twice
+    if outline_sigma == spot_sigma:
+        outline_blurred = spot_blurred
+    else:
+        # NOTE(review): with the default outline_sigma=0 this presumably leaves the image unblurred - confirm
+        outline_blurred = gaussian(image, sigma=outline_sigma)
+
+    return watershed(outline_blurred, spots)
diff --git a/docs/30_function_calling/data/membrane2d.tif → docs/30_function_calling/data/membranes.tif b/docs/30_function_calling/data/membrane2d.tif → docs/30_function_calling/data/membranes.tif
diff --git a/docs/50_code_generation/02_generating_code.ipynb b/docs/50_code_generation/02_generating_code.ipynb
@@ -0,0 +1,242 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "d2203d9d-f5cc-4198-bd78-1d56f98369ac",
+   "metadata": {},
+   "source": [
+    "# Code generation\n",
+    "Code generation makes sense in the context of scientific data analysis especially because code can be executed again and again producing the same results. \n",
+    "\n",
+    "As an example, we count bright blobs in an image."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "aaa383f2-762d-4816-95f3-b259e4723878",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import openai\n",
+    "from IPython.display import Markdown\n",
+    "\n",
+    "def prompt(message:str, model=\"gpt-4o-2024-08-06\"):\n",
+    "    \"\"\"A prompt helper function that sends a message to openAI\n",
+    "    and returns only the text response.\n",
+    "    \"\"\"\n",
+    "    client = openai.OpenAI()\n",
+    "    response = client.chat.completions.create(\n",
+    "        model=model,\n",
+    "        messages=[{\"role\": \"user\", \"content\": message}]\n",
+    "    )\n",
+    "    return response.choices[0].message.content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a8aec675-cde8-4870-ac52-fafac89d543e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "my_prompt = \"\"\"\n",
+    "Write Python code that loads blobs.tif, \n",
+    "counts bright blobs and prints the number. \n",
+    "Return this code only.\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dc649cf5-2194-4bd4-ac0f-52b98607430c",
+   "metadata": {},
+   "source": [
+    "## Reviewing generated code\n",
+    "When generating code, it is recommended to print / visualize it before executing it. If you automatically execute code without reviewing it first, it may harm your computer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "06ddee05-5e7f-4656-a877-a8ee61c84c10",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "```python\n",
+       "import cv2\n",
+       "import numpy as np\n",
+       "\n",
+       "# Load the image\n",
+       "image = cv2.imread('blobs.tif', cv2.IMREAD_GRAYSCALE)\n",
+       "\n",
+       "# Thresholding the image to binary\n",
+       "_, binary_image = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)\n",
+       "\n",
+       "# Find connected components (blobs)\n",
+       "num_labels, labels_im = cv2.connectedComponents(binary_image)\n",
+       "\n",
+       "# Print the number of bright blobs, subtracting one for the background label\n",
+       "print(num_labels - 1)\n",
+       "```"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "code = prompt(my_prompt)\n",
+    "Markdown(code)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5dc2999-f4b9-434d-a923-c286f8667a7f",
+   "metadata": {},
+   "source": [
+    "If we are ok with the code, we can execute it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "83a50d24-d1cd-4541-a0f2-fb59af5608c6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "64\n"
+     ]
+    }
+   ],
+   "source": [
+    "code = code.replace(\"```python\", \"\").replace(\"```\", \"\")\n",
+    "exec(code)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5838196-e538-4026-9b74-6638166e6c6e",
+   "metadata": {},
+   "source": [
+    "## A comment on reproducibility\n",
+    "Depending on which model you use, it may produce the same code again - or not."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "03adabb8-b14e-4467-8ce8-02dcd5efc9dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/markdown": [
+       "```python\n",
+       "import numpy as np\n",
+       "from skimage import io, measure, filters, morphology\n",
+       "\n",
+       "# Load the image\n",
+       "image = io.imread('blobs.tif')\n",
+       "\n",
+       "# Convert image to grayscale if it is not\n",
+       "if len(image.shape) > 2:\n",
+       "    image = rgb2gray(image)\n",
+       "\n",
+       "# Apply a threshold to convert the image to binary\n",
+       "binary_image = image > filters.threshold_otsu(image)\n",
+       "\n",
+       "# Remove small objects to isolate blobs better\n",
+       "cleaned_image = morphology.remove_small_objects(binary_image, min_size=20)\n",
+       "\n",
+       "# Label connected components\n",
+       "labeled_image, num_blobs = measure.label(cleaned_image, return_num=True)\n",
+       "\n",
+       "# Print the number of blobs found\n",
+       "print(\"Number of bright blobs:\", num_blobs)\n",
+       "```"
+      ],
+      "text/plain": [
+       "<IPython.core.display.Markdown object>"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "code = prompt(my_prompt)\n",
+    "Markdown(code)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d393a342-fea7-42b8-a069-31bfe81a340e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of bright blobs: 61\n"
+     ]
+    }
+   ],
+   "source": [
+    "code = code.replace(\"```python\", \"\").replace(\"```\", \"\")\n",
+    "exec(code)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6559154f-8bb3-4524-8a33-8b76e05d3679",
+   "metadata": {},
+   "source": [
+    "# Exercise\n",
+    "Rerun the code, wait a minute between the two `prompt()` calls above and see if the code is identical to what is saved now. Also check if the number of blobs is the same.\n",
+    "\n",
+    "Advanced, optional exercise: Modify the prompt function to use Anthropic's Claude."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e3659ba-5b62-4ee9-997b-05d7fdde6d7b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/50_code_generation/03_generating_code.ipynb b/docs/50_code_generation/03_generating_code.ipynb
@@ -5,7 +5,7 @@
    "id": "85990a02-74d0-46f2-a638-00f37d8d1b3e",
    "metadata": {},
    "source": [
-    "## Generating code\n",
+    "## Prompt engineering for code generation\n",
     "In this notebook we will produce some image processing Python code and execute it to see if it works. We will build up the query gradually to demonstrate that short concise high quality code depends on the details we present in the prompt."
    ]
   },

diff --git a/docs/_toc.yml b/docs/_toc.yml
@@ -38,6 +38,7 @@ parts:
       - file: 30_function_calling/10_function_calling.ipynb
       - file: 30_function_calling/20_langchain.ipynb
       - file: 30_function_calling/30_langchain_bia.ipynb
+      - file: 30_function_calling/35_langchain_bia_choosing_algorithms.ipynb
       - file: 30_function_calling/50_blablado.ipynb
       - file: 30_function_calling/55_microscope_stage_demo.ipynb
 
@@ -66,6 +67,7 @@ parts:
 
     - file: 50_code_generation/readme.md
       sections:
+      - file: 50_code_generation/02_generating_code.ipynb
       - file: 50_code_generation/03_generating_code.ipynb
       - file: 50_code_generation/04_generating_code_for_processing_images.ipynb
       - file: 50_code_generation/06_system_messages.ipynb