v0.2.17: Fixes malformed transcript critical bug

souzatharsis · Oct 31, 2024 · a18b91a · a18b91a
1 parent 035862c
commit a18b91a
Show file tree

Hide file tree

Showing 6 changed files with 126 additions and 11 deletions.
diff --git a/podcastfy/__init__.py b/podcastfy/__init__.py
@@ -1,2 +1,2 @@
 # This file can be left empty for now
-__version__ = "0.2.16"  # or whatever version you're on
+__version__ = "0.2.17"  # or whatever version you're on
diff --git a/podcastfy/config.yaml b/podcastfy/config.yaml
@@ -2,7 +2,7 @@ content_generator:
   gemini_model: "gemini-1.5-pro-latest"
   max_output_tokens: 8192
   prompt_template: "souzatharsis/podcastfy_multimodal_cleanmarkup"
-  prompt_commit: "3d5b42fc"
+  prompt_commit: "23094489"
 content_extractor:
   youtube_url_patterns:
     - "youtube.com"

diff --git a/podcastfy/content_generator.py b/podcastfy/content_generator.py
@@ -258,12 +258,12 @@ def generate_qa_content(
                 image_file_paths, image_path_keys, input_texts
             )
 
-            response_raw = self.chain.invoke(
+            self.response = self.chain.invoke(
                 prompt_params
             )  # in the future, make sure we have structured output
 
             # Clean up scratchpad blocks from response
-            self.response = self.__clean_scratchpad(response_raw)
+            self.response = self.__clean_tss_markup(self.response)
 
             logger.info(f"Content generated successfully")
 
@@ -278,6 +278,49 @@ def generate_qa_content(
             raise
 
 
+    def __clean_tss_markup(self, input_text: str, additional_tags: List[str] = ["Person1", "Person2"]) -> str:
+        """
+        Remove unsupported TSS markup tags from the input text while preserving supported SSML tags.
+
+        Args:
+            input_text (str): The input text containing TSS markup tags.
+			additional_tags (List[str]): Optional list of additional tags to preserve. Defaults to ["Person1", "Person2"].
+
+		Returns:
+			str: Cleaned text with unsupported TSS markup tags removed.
+		"""
+        # List of SSML tags supported by both OpenAI and ElevenLabs
+        supported_tags = [
+            "speak", "lang", "p", "phoneme",
+            "s", "sub"
+        ]
+
+        # Append additional tags to the supported tags list
+        supported_tags.extend(additional_tags)
+
+        # Create a pattern that matches any tag not in the supported list
+        pattern = r'</?(?!(?:' + '|'.join(supported_tags) + r')\b)[^>]+>'
+
+        # Remove unsupported tags
+        cleaned_text = re.sub(pattern, '', input_text)
+
+        # Remove any leftover empty lines
+        cleaned_text = re.sub(r'\n\s*\n', '\n', cleaned_text)
+
+        # Ensure closing tags for additional tags are preserved
+        for tag in additional_tags:
+            cleaned_text = re.sub(
+                f'<{tag}>(.*?)(?=<(?:{"|".join(additional_tags)})>|$)',
+                f'<{tag}>\\1</{tag}>',
+                cleaned_text,
+                flags=re.DOTALL
+            )
+
+        return cleaned_text.replace('(scratchpad)', '').strip()
+
+
+
+
 def main(seed: int = 42, is_local: bool = False) -> None:
     """
     Generate Q&A content based on input text from input_text.txt using the specified LLM backend.
@@ -322,4 +365,4 @@ def main(seed: int = 42, is_local: bool = False) -> None:
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/podcastfy/text_to_speech.py b/podcastfy/text_to_speech.py
@@ -80,17 +80,22 @@ def convert_to_speech(self, text: str, output_file: str) -> None:
         Args:
                 text (str): Input text to convert to speech.
                 output_file (str): Path to save the output audio file.
+                
+        Raises:
+            ValueError: If the input text is not properly formatted
         """
-
-        # Then clean up TSS markup
+        # Validate transcript format
+        #self._validate_transcript_format(text)
+
+        # Clean up TSS markup
         cleaned_text = text
-
+        
         try:
             with tempfile.TemporaryDirectory(dir=self.temp_audio_dir) as temp_dir:
                 audio_segments = self._generate_audio_segments(cleaned_text, temp_dir)
                 self._merge_audio_files(audio_segments, output_file)
                 logger.info(f"Audio saved to {output_file}")
-
+                
         except Exception as e:
             logger.error(f"Error converting text to speech: {str(e)}")
             raise
@@ -201,6 +206,73 @@ def _setup_directories(self) -> None:
             if dir_path and not os.path.exists(dir_path):
                 os.makedirs(dir_path)
 
+    def _validate_transcript_format(self, text: str) -> None:
+        """
+        Validate that the input text follows the correct transcript format.
+        
+        Args:
+            text (str): Input text to validate
+            
+        Raises:
+            ValueError: If the text is not properly formatted
+            
+        The text should:
+        1. Have alternating Person1 and Person2 tags
+        2. Each opening tag should have a closing tag
+        3. Tags should be properly nested
+        """
+        try:
+            # Check for empty text
+            if not text.strip():
+                raise ValueError("Input text is empty")
+
+            # Check for matching opening and closing tags
+            person1_open = text.count("<Person1>")
+            person1_close = text.count("</Person1>")
+            person2_open = text.count("<Person2>")
+            person2_close = text.count("</Person2>")
+
+            if person1_open != person1_close:
+                raise ValueError(f"Mismatched Person1 tags: {person1_open} opening tags and {person1_close} closing tags")
+            if person2_open != person2_close:
+                raise ValueError(f"Mismatched Person2 tags: {person2_open} opening tags and {person2_close} closing tags")
+
+            # Check for alternating pattern using regex
+            pattern = r"<Person1>.*?</Person1>\s*<Person2>.*?</Person2>"
+            matches = re.findall(pattern, text, re.DOTALL)
+
+            # Calculate expected number of pairs
+            expected_pairs = min(person1_open, person2_open)
+
+            if len(matches) != expected_pairs:
+                raise ValueError(
+                    "Tags are not properly alternating between Person1 and Person2. "
+                    "Each Person1 section should be followed by a Person2 section."
+                )
+
+            # Check for malformed tags (unclosed or improperly nested)
+                stack = []
+                for match in re.finditer(r"<(/?)Person([12])>", text):
+                    tag = match.group(0)
+                    if tag.startswith("</"):
+                        if not stack or stack[-1] != tag[2:-1]:
+                            raise ValueError(f"Improperly nested tags near: {tag}")
+                        stack.pop()
+                    else:
+                        stack.append(tag[1:-1])
+
+                if stack:
+                    raise ValueError(f"Unclosed tags: {', '.join(stack)}")
+
+            logger.debug("Transcript format validation passed")
+
+        except ValueError as e:
+            logger.error(f"Transcript format validation failed: {str(e)}")
+            raise
+        except Exception as e:
+            logger.error(f"Unexpected error during transcript validation: {str(e)}")
+            raise ValueError(f"Invalid transcript format: {str(e)}")
+
 
 def main(seed: int = 42) -> None:
     """

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "podcastfy"
-version = "0.2.16"
+version = "0.2.17"
 description = "An Open Source alternative to NotebookLM's podcast feature: Transforming Multimodal Content into Captivating Multilingual Audio Conversations with GenAI"
 authors = ["Tharsis T. P. Souza"]
 license = "Apache-2.0"

diff --git a/tests/test_client.py b/tests/test_client.py
@@ -148,7 +148,7 @@ def test_generate_transcript_only(sample_config):
             for tag in re.findall(r"<Person2>.*?</Person2>", content)
         )
 
-
+@pytest.mark.skip(reason="Not supported yet")
 def test_generate_podcast_from_urls_and_file(mock_files, sample_config):
     result = runner.invoke(
         app,