Skip to content

Commit

Permalink
Hide TTS filename behind random token (#131192)
Browse files Browse the repository at this point in the history
* Hide TTS filename behind random token

* Clean up and fix test snapshots

* Fix tests

* Fix cloud tests
  • Loading branch information
synesthesiam authored Nov 25, 2024
1 parent cb4636a commit d4071e7
Show file tree
Hide file tree
Showing 7 changed files with 701 additions and 664 deletions.
24 changes: 22 additions & 2 deletions homeassistant/components/tts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import mimetypes
import os
import re
import secrets
import subprocess
import tempfile
from typing import Any, Final, TypedDict, final
Expand Down Expand Up @@ -540,6 +541,10 @@ def __init__(
self.file_cache: dict[str, str] = {}
self.mem_cache: dict[str, TTSCache] = {}

# filename <-> token
self.filename_to_token: dict[str, str] = {}
self.token_to_filename: dict[str, str] = {}

def _init_cache(self) -> dict[str, str]:
"""Init cache folder and fetch files."""
try:
Expand Down Expand Up @@ -656,7 +661,17 @@ async def async_get_url_path(
engine_instance, cache_key, message, use_cache, language, options
)

return f"/api/tts_proxy/{filename}"
# Use a randomly generated token instead of exposing the filename
token = self.filename_to_token.get(filename)
if not token:
# Keep extension (.mp3, etc.)
token = secrets.token_urlsafe(16) + os.path.splitext(filename)[1]

# Map token <-> filename
self.filename_to_token[filename] = token
self.token_to_filename[token] = filename

return f"/api/tts_proxy/{token}"

async def async_get_tts_audio(
self,
Expand Down Expand Up @@ -910,11 +925,15 @@ def async_remove_from_mem(_: datetime) -> None:
),
)

async def async_read_tts(self, filename: str) -> tuple[str | None, bytes]:
async def async_read_tts(self, token: str) -> tuple[str | None, bytes]:
"""Read a voice file and return binary.

This method is a coroutine.
"""
filename = self.token_to_filename.get(token)
if not filename:
raise HomeAssistantError(f"{token} was not recognized!")

if not (record := _RE_VOICE_FILE.match(filename.lower())) and not (
record := _RE_LEGACY_VOICE_FILE.match(filename.lower())
):
Expand Down Expand Up @@ -1076,6 +1095,7 @@ def __init__(self, tts: SpeechManager) -> None:
async def get(self, request: web.Request, filename: str) -> web.Response:
"""Start a get request."""
try:
# filename is actually token, but we keep its name for compatibility
content, data = await self.tts.async_read_tts(filename)
except HomeAssistantError as err:
_LOGGER.error("Error on load tts: %s", err)
Expand Down
8 changes: 4 additions & 4 deletions tests/components/assist_pipeline/snapshots/test_init.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
'tts_output': dict({
'media_id': "media-source://tts/test?message=Sorry,+I+couldn't+understand+that&language=en-US&tts_options=%7B%22voice%22:%22james_earl_jones%22%7D",
'mime_type': 'audio/mpeg',
'url': '/api/tts_proxy/dae2cdcb27a1d1c3b07ba2c7db91480f9d4bfd8f_en-us_031e2ec052_test.mp3',
'url': '/api/tts_proxy/test_token.mp3',
}),
}),
'type': <PipelineEventType.TTS_END: 'tts-end'>,
Expand Down Expand Up @@ -166,7 +166,7 @@
'tts_output': dict({
'media_id': "media-source://tts/test?message=Sorry,+I+couldn't+understand+that&language=en-US&tts_options=%7B%22voice%22:%22Arnold+Schwarzenegger%22%7D",
'mime_type': 'audio/mpeg',
'url': '/api/tts_proxy/dae2cdcb27a1d1c3b07ba2c7db91480f9d4bfd8f_en-us_2657c1a8ee_test.mp3',
'url': '/api/tts_proxy/test_token.mp3',
}),
}),
'type': <PipelineEventType.TTS_END: 'tts-end'>,
Expand Down Expand Up @@ -255,7 +255,7 @@
'tts_output': dict({
'media_id': "media-source://tts/test?message=Sorry,+I+couldn't+understand+that&language=en-US&tts_options=%7B%22voice%22:%22Arnold+Schwarzenegger%22%7D",
'mime_type': 'audio/mpeg',
'url': '/api/tts_proxy/dae2cdcb27a1d1c3b07ba2c7db91480f9d4bfd8f_en-us_2657c1a8ee_test.mp3',
'url': '/api/tts_proxy/test_token.mp3',
}),
}),
'type': <PipelineEventType.TTS_END: 'tts-end'>,
Expand Down Expand Up @@ -368,7 +368,7 @@
'tts_output': dict({
'media_id': "media-source://tts/test?message=Sorry,+I+couldn't+understand+that&language=en-US&tts_options=%7B%22voice%22:%22james_earl_jones%22%7D",
'mime_type': 'audio/mpeg',
'url': '/api/tts_proxy/dae2cdcb27a1d1c3b07ba2c7db91480f9d4bfd8f_en-us_031e2ec052_test.mp3',
'url': '/api/tts_proxy/test_token.mp3',
}),
}),
'type': <PipelineEventType.TTS_END: 'tts-end'>,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
'tts_output': dict({
'media_id': "media-source://tts/test?message=Sorry,+I+couldn't+understand+that&language=en-US&tts_options=%7B%22voice%22:%22james_earl_jones%22%7D",
'mime_type': 'audio/mpeg',
'url': '/api/tts_proxy/dae2cdcb27a1d1c3b07ba2c7db91480f9d4bfd8f_en-us_031e2ec052_test.mp3',
'url': '/api/tts_proxy/test_token.mp3',
}),
})
# ---
Expand Down Expand Up @@ -154,7 +154,7 @@
'tts_output': dict({
'media_id': "media-source://tts/test?message=Sorry,+I+couldn't+understand+that&language=en-US&tts_options=%7B%22voice%22:%22james_earl_jones%22%7D",
'mime_type': 'audio/mpeg',
'url': '/api/tts_proxy/dae2cdcb27a1d1c3b07ba2c7db91480f9d4bfd8f_en-us_031e2ec052_test.mp3',
'url': '/api/tts_proxy/test_token.mp3',
}),
})
# ---
Expand Down Expand Up @@ -247,7 +247,7 @@
'tts_output': dict({
'media_id': "media-source://tts/test?message=Sorry,+I+couldn't+understand+that&language=en-US&tts_options=%7B%22voice%22:%22james_earl_jones%22%7D",
'mime_type': 'audio/mpeg',
'url': '/api/tts_proxy/dae2cdcb27a1d1c3b07ba2c7db91480f9d4bfd8f_en-us_031e2ec052_test.mp3',
'url': '/api/tts_proxy/test_token.mp3',
}),
})
# ---
Expand Down Expand Up @@ -350,7 +350,7 @@
'tts_output': dict({
'media_id': "media-source://tts/test?message=Sorry,+I+couldn't+understand+that&language=en-US&tts_options=%7B%22voice%22:%22james_earl_jones%22%7D",
'mime_type': 'audio/mpeg',
'url': '/api/tts_proxy/dae2cdcb27a1d1c3b07ba2c7db91480f9d4bfd8f_en-us_031e2ec052_test.mp3',
'url': '/api/tts_proxy/test_token.mp3',
}),
})
# ---
Expand Down
148 changes: 80 additions & 68 deletions tests/components/assist_pipeline/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,24 @@ async def audio_data():
yield make_10ms_chunk(b"part2")
yield b""

await assist_pipeline.async_pipeline_from_audio_stream(
hass,
context=Context(),
event_callback=events.append,
stt_metadata=stt.SpeechMetadata(
language="",
format=stt.AudioFormats.WAV,
codec=stt.AudioCodecs.PCM,
bit_rate=stt.AudioBitRates.BITRATE_16,
sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
channel=stt.AudioChannels.CHANNEL_MONO,
),
stt_stream=audio_data(),
audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
)
with patch(
"homeassistant.components.tts.secrets.token_urlsafe", return_value="test_token"
):
await assist_pipeline.async_pipeline_from_audio_stream(
hass,
context=Context(),
event_callback=events.append,
stt_metadata=stt.SpeechMetadata(
language="",
format=stt.AudioFormats.WAV,
codec=stt.AudioCodecs.PCM,
bit_rate=stt.AudioBitRates.BITRATE_16,
sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
channel=stt.AudioChannels.CHANNEL_MONO,
),
stt_stream=audio_data(),
audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
)

assert process_events(events) == snapshot
assert len(mock_stt_provider_entity.received) == 2
Expand Down Expand Up @@ -133,23 +136,26 @@ async def audio_data():
assert msg["success"]
pipeline_id = msg["result"]["id"]

# Use the created pipeline
await assist_pipeline.async_pipeline_from_audio_stream(
hass,
context=Context(),
event_callback=events.append,
stt_metadata=stt.SpeechMetadata(
language="en-UK",
format=stt.AudioFormats.WAV,
codec=stt.AudioCodecs.PCM,
bit_rate=stt.AudioBitRates.BITRATE_16,
sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
channel=stt.AudioChannels.CHANNEL_MONO,
),
stt_stream=audio_data(),
pipeline_id=pipeline_id,
audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
)
with patch(
"homeassistant.components.tts.secrets.token_urlsafe", return_value="test_token"
):
# Use the created pipeline
await assist_pipeline.async_pipeline_from_audio_stream(
hass,
context=Context(),
event_callback=events.append,
stt_metadata=stt.SpeechMetadata(
language="en-UK",
format=stt.AudioFormats.WAV,
codec=stt.AudioCodecs.PCM,
bit_rate=stt.AudioBitRates.BITRATE_16,
sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
channel=stt.AudioChannels.CHANNEL_MONO,
),
stt_stream=audio_data(),
pipeline_id=pipeline_id,
audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
)

assert process_events(events) == snapshot
assert len(mock_stt_provider.received) == 2
Expand Down Expand Up @@ -198,23 +204,26 @@ async def audio_data():
assert msg["success"]
pipeline_id = msg["result"]["id"]

# Use the created pipeline
await assist_pipeline.async_pipeline_from_audio_stream(
hass,
context=Context(),
event_callback=events.append,
stt_metadata=stt.SpeechMetadata(
language="en-UK",
format=stt.AudioFormats.WAV,
codec=stt.AudioCodecs.PCM,
bit_rate=stt.AudioBitRates.BITRATE_16,
sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
channel=stt.AudioChannels.CHANNEL_MONO,
),
stt_stream=audio_data(),
pipeline_id=pipeline_id,
audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
)
with patch(
"homeassistant.components.tts.secrets.token_urlsafe", return_value="test_token"
):
# Use the created pipeline
await assist_pipeline.async_pipeline_from_audio_stream(
hass,
context=Context(),
event_callback=events.append,
stt_metadata=stt.SpeechMetadata(
language="en-UK",
format=stt.AudioFormats.WAV,
codec=stt.AudioCodecs.PCM,
bit_rate=stt.AudioBitRates.BITRATE_16,
sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
channel=stt.AudioChannels.CHANNEL_MONO,
),
stt_stream=audio_data(),
pipeline_id=pipeline_id,
audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
)

assert process_events(events) == snapshot
assert len(mock_stt_provider_entity.received) == 2
Expand Down Expand Up @@ -362,25 +371,28 @@ async def audio_data():

yield b""

await assist_pipeline.async_pipeline_from_audio_stream(
hass,
context=Context(),
event_callback=events.append,
stt_metadata=stt.SpeechMetadata(
language="",
format=stt.AudioFormats.WAV,
codec=stt.AudioCodecs.PCM,
bit_rate=stt.AudioBitRates.BITRATE_16,
sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
channel=stt.AudioChannels.CHANNEL_MONO,
),
stt_stream=audio_data(),
start_stage=assist_pipeline.PipelineStage.WAKE_WORD,
wake_word_settings=assist_pipeline.WakeWordSettings(
audio_seconds_to_buffer=1.5
),
audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
)
with patch(
"homeassistant.components.tts.secrets.token_urlsafe", return_value="test_token"
):
await assist_pipeline.async_pipeline_from_audio_stream(
hass,
context=Context(),
event_callback=events.append,
stt_metadata=stt.SpeechMetadata(
language="",
format=stt.AudioFormats.WAV,
codec=stt.AudioCodecs.PCM,
bit_rate=stt.AudioBitRates.BITRATE_16,
sample_rate=stt.AudioSampleRates.SAMPLERATE_16000,
channel=stt.AudioChannels.CHANNEL_MONO,
),
stt_stream=audio_data(),
start_stage=assist_pipeline.PipelineStage.WAKE_WORD,
wake_word_settings=assist_pipeline.WakeWordSettings(
audio_seconds_to_buffer=1.5
),
audio_settings=assist_pipeline.AudioSettings(is_vad_enabled=False),
)

assert process_events(events) == snapshot

Expand Down
Loading

0 comments on commit d4071e7

Please sign in to comment.