SYSTRAN · HRashidi · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,53 @@
+name: Publish Python Package
+
+on:
+  # Automatic trigger: fires when a GitHub release is published.
+  release:
+    types: [published]
+  # Manual trigger: the caller picks the package index to upload to.
+  workflow_dispatch:
+    inputs:
+      publish_target:
+        type: choice
+        description: "Select the target PyPI repository"
+        options: [pypi, testpypi]
+        default: "testpypi"
+        required: true
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.10"
+
+    # One install covers both the build (wheel) and the upload (twine).
+    - name: Install build and publish requirements
+      run: pip3 install setuptools wheel twine
+
+    - name: Build the package
+      run: python3 setup.py sdist bdist_wheel
+
+    # Production upload: twine's default repository is PyPI, so no -r flag.
+    - name: Publish to PyPI
+      if: github.event_name == 'release' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish_target == 'pypi')
+      env:
+        TWINE_USERNAME: "__token__"
+        TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+      run: |
+        twine upload dist/*
+
+    # Test upload: -r testpypi selects twine's built-in Test PyPI repository.
+    - name: Publish to Test PyPI
+      if: github.event_name == 'workflow_dispatch' && github.event.inputs.publish_target == 'testpypi'
+      env:
+        TWINE_USERNAME: "__token__"
+        TWINE_PASSWORD: ${{ secrets.TEST_PYPI_API_TOKEN }}
+      run: |
+        twine upload -r testpypi dist/*
diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py
@@ -520,13 +520,17 @@ def transcribe(
         audio_segments = torch.nested.nested_tensor(audio_segments).to_padded_tensor(
             padding=0
         )
-        features = torch.stack(
-            [
-                self.model.feature_extractor(audio_segment, to_cpu=to_cpu)[
-                    ..., : self.model.feature_extractor.nb_max_frames
+        features = ( 
+            torch.stack(
+                [
+                    self.model.feature_extractor(audio_segment, to_cpu=to_cpu)[
+                        ..., : self.model.feature_extractor.nb_max_frames
+                    ]
+                    for audio_segment in audio_segments
                 ]
-                for audio_segment in audio_segments
-            ]
+            )
+            if duration_after_vad
+            else []
         )
 
         segments = self._batched_segments_generator(
@@ -944,13 +948,19 @@ def transcribe(
                     )
                     seek += segment.shape[-1]
                 else:
-                    # If no language detected for all segments, the majority vote of the highest
-                    # projected languages for all segments is used to determine the language.
-                    language = max(
-                        detected_language_info,
-                        key=lambda lang: len(detected_language_info[lang]),
-                    )
-                    language_probability = max(detected_language_info[language])
+                    if detected_language_info:
+                        # If no language detected for all segments, the majority vote of the highest
+                        # projected languages for all segments is used to determine the language.
+                        language = max(
+                            detected_language_info,
+                            key=lambda lang: len(detected_language_info[lang]),
+                        )
+                        language_probability = max(detected_language_info[language])
+                    else:
+                        # It's possible VAD removes all segments due to no voice,
+                        # then it doesn't matter which language
+                        language = "en"
+                        language_probability = 0
 
                 self.logger.info(
                     "Detected language '%s' with probability %.2f",

diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py
@@ -26,6 +26,8 @@
     "distil-medium.en": "Systran/faster-distil-whisper-medium.en",
     "distil-small.en": "Systran/faster-distil-whisper-small.en",
     "distil-large-v3": "Systran/faster-distil-whisper-large-v3",
+    "large-v3-trubo": "mobiuslabsgmbh/faster-whisper-large-v3-turbo",
+    "turbo": "mobiuslabsgmbh/faster-whisper-large-v3-turbo",
 }
 
 

diff --git a/requirements.txt b/requirements.txt
@@ -3,6 +3,6 @@ huggingface_hub>=0.13
 tokenizers>=0.13,<1
 onnxruntime>=1.14,<2 
 pyannote-audio>=3.1.1
-torch>=2.1.1 
-torchaudio>=2.1.2
+torch>=2.1.1,<2.4.0
+torchaudio>=2.1.2,<2.4.0
 tqdm