sensein · fabiocat93 · Sep 20, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/.github/workflows/e2c-runner-tests-310.yaml b/.github/workflows/e2c-runner-tests-310.yaml
@@ -0,0 +1,149 @@
+name: e2c-runner-tests-310
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, labeled]
+
+jobs:
+  start-runner:
+    if: github.event.pull_request.draft == false && contains(github.event.pull_request.labels.*.name, 'to-test-gpu')
+    name: start-runner
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+      job-ran: ${{ steps.set-ran.outputs.ran }}
+    steps:
+    - id: set-ran
+      run: echo "::set-output name=ran::true"
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v1
+      with:
+        aws-access-key-id: ${{ secrets.AWS_KEY_ID }}
+        aws-secret-access-key: ${{ secrets.AWS_KEY_SECRET }}
+        aws-region: ${{ vars.AWS_REGION }}
+    - name: Start EC2 runner
+      id: start-ec2-runner
+      uses: machulav/ec2-github-runner@v2
+      with:
+        mode: start
+        github-token: ${{ secrets.GH_TOKEN }}
+        ec2-image-id: ${{ vars.AWS_IMAGE_ID }}
+        ec2-instance-type: ${{ vars.AWS_INSTANCE_TYPE }}
+        subnet-id: ${{ vars.AWS_SUBNET }}
+        security-group-id: ${{ vars.AWS_SECURITY_GROUP }}
+
+
+  ubuntu-tests-310:
+    name: ubuntu-tests-310
+    needs: start-runner
+    runs-on: ${{ needs.start-runner.outputs.label }}
+    defaults:
+      run:
+        shell: bash
+        working-directory: ${{ vars.WORKING_DIR }}
+    strategy:
+      matrix:
+        python-version: ['3.10']
+    env:
+      WORKING_DIR: ${{ vars.WORKING_DIR }}
+      POETRY_CACHE_DIR: ${{ vars.WORKING_DIR }}
+    outputs:
+      job-ran: ${{ steps.set-ran.outputs.ran }}
+    steps:
+    - id: set-ran
+      run: echo "::set-output name=ran::true"
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 1   # no need for the history
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install ffmpeg (Ubuntu)
+      if: startsWith(matrix.os, 'ubuntu')
+      run: sudo apt-get update && sudo apt-get install -y ffmpeg
+      shell: bash
+    - name: Install Poetry
+      uses: snok/install-poetry@v1
+      with:
+        version: 1.7.1
+        virtualenvs-create: true
+        virtualenvs-in-project: true
+    - name: Check available space
+      run: |
+        df -h
+      shell: bash
+    - name: Echo python info
+      run: |
+        python --version
+        which python
+      shell: bash
+    - name: Copy senselab directory to current directory
+      run: |
+        cp -r /actions-runner/_work/senselab/senselab .
+    - name: Install dependencies with Poetry
+      run: |
+        cd senselab
+        poetry env use ${{ matrix.python-version }}
+        poetry run pip install iso-639
+        poetry install --with dev
+      shell: bash
+    - name: Check poetry info
+      run: |
+        cd senselab
+        poetry env info
+        poetry --version
+      shell: bash
+    - name: Check NVIDIA SMI details
+      run: |
+        cd senselab
+        poetry run nvidia-smi
+        poetry run nvidia-smi -L
+        poetry run nvidia-smi -q -d Memory
+      shell: bash
+    - name: Prepare cache folder for pytest
+      run: mkdir -p $WORKING_DIR/pytest/temp
+      shell: bash
+    - name: Run unit tests
+      id: run-tests
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: >
+        cd senselab && poetry run pytest \
+          --rootdir=$WORKING_DIR/pytest \
+          --basetemp=$WORKING_DIR/pytest/temp \
+          --junitxml=pytest.xml \
+          --cov-report=term-missing:skip-covered \
+          --cov-report=xml:coverage.xml \
+          --cov=src src/tests \
+          --log-level=DEBUG \
+          --verbose
+      shell: bash
+
+  stop-runner:
+    name: stop-runner
+    needs:
+    - start-runner   # waits for the EC2 instance to be created
+    - ubuntu-tests-310   # waits for the actual job to finish
+    runs-on: ubuntu-latest
+    if: ${{ needs.start-runner.outputs.job-ran == 'true' && needs.ubuntu-tests-310.outputs.job-ran == 'true' || failure() }} # required to stop the runner even if an error occurred in previous jobs
+    steps:
+    - name: Check available space
+      run: |
+        df -h
+      shell: bash
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@v1
+      with:
+        aws-access-key-id: ${{ secrets.AWS_KEY_ID }}
+        aws-secret-access-key: ${{ secrets.AWS_KEY_SECRET }}
+        aws-region: ${{ vars.AWS_REGION }}
+    - name: Stop EC2 runner
+      uses: machulav/ec2-github-runner@v2
+      with:
+        mode: stop
+        github-token: ${{ secrets.GH_TOKEN }}
+        label: ${{ needs.start-runner.outputs.label }}
+        ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
diff --git a/.github/workflows/test.yaml → .github/workflows/github-runner-tests.yaml b/.github/workflows/test.yaml → .github/workflows/github-runner-tests.yaml
@@ -1,55 +1,43 @@
-name: Python Tests
+name: github-runner-tests
 
 on:
   pull_request:
+    types: [opened, synchronize, reopened, labeled]
 
 jobs:
-  unit:
+  macos-tests:
+    if: github.event.pull_request.draft == false && contains(github.event.pull_request.labels.*.name, 'to-test')
+    name: macOS-tests
     runs-on: ${{ matrix.os }}
     strategy:
-      fail-fast: false
+      fail-fast: true
       matrix:
         include:
-        - {os: ubuntu-latest, architecture: x64, python-version: '3.10'}
-        - {os: ubuntu-latest, architecture: x64, python-version: '3.11'}
-        - {os: ubuntu-latest, architecture: x64, python-version: '3.12'}
-        - {os: macos-latest, architecture: x64, python-version: '3.10'}
         - {os: macos-latest, architecture: arm64, python-version: '3.10'}
-        - {os: macos-latest, architecture: x64, python-version: '3.11'}
-        - {os: macos-latest, architecture: arm64, python-version: '3.11'}
-        - {os: macos-latest, architecture: x64, python-version: '3.12'}
-        - {os: macos-latest, architecture: arm64, python-version: '3.12'}
-        # - {os: windows-latest, architecture: x64, python-version: '3.10'}
-        # - {os: windows-latest, architecture: x64, python-version: '3.11'}
-    env:
-      GITHUB_ACTIONS: true
+        # - {os: macos-latest, architecture: arm64, python-version: '3.11'}
+        # the reason why we commented out 3.11 is that it hits github rate limit for some modules (e.g., knn-vc, Camb-ai/mars5-tts)
     steps:
     - uses: actions/checkout@v4
-      with:  # no need for the history
-        fetch-depth: 1
+      with:
+        fetch-depth: 1   # no need for the history
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
-
     - name: Install ffmpeg (Ubuntu)
       if: startsWith(matrix.os, 'ubuntu')
       run: sudo apt-get update && sudo apt-get install -y ffmpeg
+      shell: bash
     - name: Install ffmpeg (macOS)
       if: startsWith(matrix.os, 'macos')
       run: brew install ffmpeg
-    - name: Install ffmpeg (Windows)
-      if: startsWith(matrix.os, 'windows')
-      run: choco install ffmpeg
-
-    - name: Install pipx and ensure it's up to date
-      run: |
-        python -m pip install --upgrade pipx
-        pipx ensurepath
-      shell: bash
-    - name: Install poetry
-      run: pipx install poetry==1.7.1
       shell: bash
+    - name: Install Poetry
+      uses: snok/install-poetry@v1
+      with:
+        version: 1.7.1
+        virtualenvs-create: true
+        virtualenvs-in-project: true
     - name: Install dependencies with Poetry
       run: |
         poetry run pip install iso-639
@@ -58,9 +46,10 @@ jobs:
     - name: Run unit tests
       id: run-tests
       env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: >
-        poetry run pytest \
+        poetry run pytest -n auto \
           --junitxml=pytest.xml \
           --cov-report=term-missing:skip-covered \
           --cov-report=xml:coverage.xml \
@@ -74,11 +63,12 @@ jobs:
         token: ${{ secrets.CODECOV_TOKEN }}
 
   pre-commit:
+    if: github.event.pull_request.draft == false && contains(github.event.pull_request.labels.*.name, 'to-test')
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest] # For demonstration, other OSes are commented out: macos-latest, windows-latest
-        python-version: ['3.10'] # For speeding up the process we removed "3.11" for now
+        os: [ubuntu-latest]
+        python-version: ['3.10']
     steps:
     - uses: actions/checkout@v4
       with:  # no need for the history
@@ -87,14 +77,12 @@ jobs:
       uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Install pipx and ensure it's up to date
-      run: |
-        python -m pip install --upgrade pipx
-        pipx ensurepath
-      shell: bash
-    - name: Install poetry
-      run: pipx install poetry==1.7.1
-      shell: bash
+    - name: Install Poetry
+      uses: snok/install-poetry@v1
+      with:
+        version: 1.7.1
+        virtualenvs-create: true
+        virtualenvs-in-project: true
     - name: Install dependencies with Poetry
       run: |
         poetry run pip install iso-639
@@ -104,8 +92,6 @@ jobs:
       run: pipx install pre-commit
       shell: bash
     - name: Run pre-commit
-      env:
-        SKIP: pytest
       run: |
         poetry run pre-commit run --all-files
       shell: bash
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -73,13 +73,3 @@ repos:
     entry: YAML files must have .yaml extension.
     language: fail
     files: \.yml$
-
-- repo: local
-  hooks:
-  - id: pytest
-    name: pytest
-    entry: poetry run pytest --testmon
-    language: system
-    types: [python]
-    pass_filenames: false
-    always_run: true
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -55,7 +55,7 @@ If you feel that the functionality you have added to senselab requires some extr
 
 ### An example of well documented function following Google-style
 
-````
+```python
 import statistics
 from typing import Dict, List
 
@@ -99,4 +99,4 @@ def calculate_statistics(data: List[float]) -> Dict[str, float]:
         'variance': variance,
         'std_dev': std_dev
     }
-````
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,7 +20,6 @@ classifiers = [
   "Development Status :: 3 - Alpha",
   "Programming Language :: Python :: 3.10",
   "Programming Language :: Python :: 3.11",
-  "Programming Language :: Python :: 3.12",
   "License :: OSI Approved :: Apache Software License",
   "Operating System :: OS Independent"
 ]
@@ -62,16 +61,15 @@ vocos = "~=0.1"
 optional = true
 
 [tool.poetry.group.dev.dependencies]
-pytest = "~=8.2"
+pytest-xdist = {version = "~=3.6.1", extras = ["psutil"]}
 pytest-mock = "~=3.14"
+pytest-cov = "~=5.0"
 mypy = "~=1.9"
 pre-commit = "~=3.7"
-pytest-cov = "~=5.0"
 ruff = "~=0.3"
 codespell = "~=2.3"
 jupyter = "~=1.0"
 ipywidgets = "~=8.1"
-pytest-testmon = "~=2.1.1"
 
 [tool.poetry.group.docs]
 optional = true

diff --git a/src/senselab/audio/workflows/transcribe_timestamped/__init__.py b/src/senselab/audio/workflows/transcribe_timestamped/__init__.py
@@ -1,5 +1,8 @@
 """Workflow for timestamped transcription."""
 
+"""
+# TODO: Please double-check this because tests are failing
 from senselab.audio.workflows.transcribe_timestamped.transcribe_timestamped import transcribe_timestamped
 
 __all__ = ["transcribe_timestamped"]
+"""
diff --git a/src/senselab/audio/workflows/transcribe_timestamped/transcribe_timestamped.py b/src/senselab/audio/workflows/transcribe_timestamped/transcribe_timestamped.py
@@ -1,5 +1,7 @@
 """Transcribes audio files with timestamps."""
 
+'''
+# TODO: Please double-check this because tests are failing
 from typing import List
 
 import pydra
@@ -77,7 +79,7 @@ def transcribe_task(audios: List[Audio], model: HFModel, language: Language) ->
             model=wf.lzin.model,
             language=wf.lzin.language,
         )
-    ).split("batched_audios", batched_audios=wf.inputs.batched_audios)
+    ).split("batched_audios", batched_audios=wf.transcribe.lzin.batched_audios)
 
     align_transcriptions_task = pydra.mark.task(align_transcriptions)
     wf.add(
@@ -99,3 +101,4 @@ def transcribe_task(audios: List[Audio], model: HFModel, language: Language) ->
         sub(wf)
 
     return wf.result()[0].output.aligned_transcriptions
+'''
diff --git a/src/senselab/text/tasks/embeddings_extraction/huggingface.py b/src/senselab/text/tasks/embeddings_extraction/huggingface.py
@@ -78,6 +78,9 @@ def extract_text_embeddings(
         device, _ = _select_device_and_dtype(
             user_preference=device, compatible_devices=[DeviceType.CUDA, DeviceType.CPU]
         )
+
+        print(f"Using device: {device}")
+
         # Load tokenizer and model
         tokenizer = cls._get_tokenizer(model=model)
         ssl_model = cls._load_model(model=model, device=device)
@@ -87,13 +90,15 @@ def extract_text_embeddings(
         # Process each piece of text individually
         for text in pieces_of_text:
             # Tokenize sentence
-            encoded_input = tokenizer(text, return_tensors="pt").to(device)
+            encoded_input = tokenizer(text, return_tensors="pt").to(device.value)
 
             # Compute token embeddings
             with torch.no_grad():
                 model_output = ssl_model(**encoded_input, output_hidden_states=True)
                 hidden_states = model_output.hidden_states
-                concatenated_hidden_states = torch.cat([state.unsqueeze(0) for state in hidden_states], dim=0)
+                concatenated_hidden_states = torch.cat(
+                    [state.to(device.value).unsqueeze(0) for state in hidden_states], dim=0
+                )
                 embeddings.append(concatenated_hidden_states.squeeze())
 
         return embeddings