diff --git a/composer.json b/composer.json index 9c6205e7c2..6cce0dab8f 100644 --- a/composer.json +++ b/composer.json @@ -42,7 +42,8 @@ "rubix/ml": "2.5.3", "sabberworm/php-css-parser": "^8.9.0", "wamania/php-stemmer": "4.0 as 3.0", - "youthweb/urllinker": "^2.1.0" + "youthweb/urllinker": "^2.1.0", + "zbateson/mb-wrapper": "^2.0" }, "provide": { "psr/log": "^1.0.4|^2|^3" diff --git a/composer.lock b/composer.lock index c59799fc95..9ea77b80c0 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "4f7575d9c8725cf45f05496c9802bc46", + "content-hash": "b0e7467092557ccd58c72f128ba42b26", "packages": [ { "name": "amphp/amp", @@ -2753,6 +2753,90 @@ ], "time": "2024-09-25T14:21:43+00:00" }, + { + "name": "symfony/polyfill-iconv", + "version": "v1.33.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-iconv.git", + "reference": "5f3b930437ae03ae5dff61269024d8ea1b3774aa" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-iconv/zipball/5f3b930437ae03ae5dff61269024d8ea1b3774aa", + "reference": "5f3b930437ae03ae5dff61269024d8ea1b3774aa", + "shasum": "" + }, + "require": { + "php": ">=7.2" + }, + "provide": { + "ext-iconv": "*" + }, + "suggest": { + "ext-iconv": "For best performance" + }, + "type": "library", + "extra": { + "thanks": { + "url": "https://github.com/symfony/polyfill", + "name": "symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Iconv\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill for the Iconv extension", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "iconv", + "polyfill", + "portable", + "shim" + ], + "support": { + "source": "https://github.com/symfony/polyfill-iconv/tree/v1.33.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://github.com/nicolas-grekas", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2024-09-17T14:58:18+00:00" + }, { "name": "symfony/polyfill-mbstring", "version": "v1.31.0", @@ -3165,6 +3249,75 @@ "source": "https://github.com/Art4/urllinker/tree/2.1.0" }, "time": "2025-07-22T10:44:28+00:00" + }, + { + "name": "zbateson/mb-wrapper", + "version": "2.0.1", + "source": { + "type": "git", + "url": "https://github.com/zbateson/mb-wrapper.git", + "reference": "50a14c0c9537f978a61cde9fdc192a0267cc9cff" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/zbateson/mb-wrapper/zipball/50a14c0c9537f978a61cde9fdc192a0267cc9cff", + "reference": "50a14c0c9537f978a61cde9fdc192a0267cc9cff", + "shasum": "" + }, + "require": { + "php": ">=8.0", + "symfony/polyfill-iconv": "^1.9", + "symfony/polyfill-mbstring": "^1.9" + }, + "require-dev": { + "friendsofphp/php-cs-fixer": "*", + "phpstan/phpstan": "*", + "phpunit/phpunit": "^9.6|^10.0" + }, + "suggest": { + "ext-iconv": "For best support/performance", + "ext-mbstring": "For best support/performance" + }, + "type": "library", + "autoload": { + "psr-4": { + "ZBateson\\MbWrapper\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-2-Clause" + ], + "authors": [ + { + "name": "Zaahid Bateson" + } + ], + "description": "Wrapper for mbstring with fallback to iconv for encoding conversion and string manipulation", + "keywords": [ + "charset", + "encoding", + "http", + "iconv", + "mail", + "mb", + "mb_convert_encoding", + "mbstring", + "mime", + "multibyte", + "string" + ], + "support": { + "issues": "https://github.com/zbateson/mb-wrapper/issues", + "source": "https://github.com/zbateson/mb-wrapper/tree/2.0.1" + }, + "funding": [ + { + "url": "https://github.com/zbateson", + "type": "github" + } + ], + "time": "2024-12-20T22:05:33+00:00" } ], "packages-dev": [ diff --git a/lib/AppInfo/Application.php b/lib/AppInfo/Application.php index 6eba519334..90952375c6 100644 --- a/lib/AppInfo/Application.php +++ b/lib/AppInfo/Application.php @@ -35,6 +35,7 @@ use OCA\Mail\HordeTranslationHandler; use OCA\Mail\Http\Middleware\ErrorMiddleware; use OCA\Mail\Http\Middleware\ProvisioningMiddleware; +use OCA\Mail\IMAP\Charset\Converter; use OCA\Mail\Listener\AccountSynchronizedThreadUpdaterListener; use OCA\Mail\Listener\AddressCollectionListener; use OCA\Mail\Listener\DeleteDraftListener; @@ -85,6 +86,7 @@ use OCP\User\Events\UserDeletedEvent; use OCP\Util; use Psr\Container\ContainerInterface; +use ZBateson\MbWrapper\MbWrapper; include_once __DIR__ . '/../../vendor/autoload.php'; @@ -116,6 +118,13 @@ public function register(IRegistrationContext $context): void { return $favicon; }); + $context->registerService(MbWrapper::class, function (ContainerInterface $c) { + return new MbWrapper(); + }); + $context->registerService(Converter::class, function (ContainerInterface $c) { + return new Converter($c->get(MbWrapper::class)); + }); + $context->registerServiceAlias(IAvatarService::class, AvatarService::class); $context->registerServiceAlias(IAttachmentService::class, AttachmentService::class); $context->registerServiceAlias(IMailManager::class, MailManager::class); diff --git a/lib/IMAP/Charset/Converter.php b/lib/IMAP/Charset/Converter.php index 9fd11f7d93..4dfa026dd0 100644 --- a/lib/IMAP/Charset/Converter.php +++ b/lib/IMAP/Charset/Converter.php @@ -11,63 +11,97 @@ use Horde_Mime_Part; use OCA\Mail\Exception\ServiceException; -use function in_array; +use ZBateson\MbWrapper\MbWrapper; +use ZBateson\MbWrapper\UnsupportedCharsetException; use function is_string; class Converter { + /** + * Prioritized charsets used for detection if header is missing/wrong. + * This list can be expanded/tweaked based on userbase/email sources/field experience. + */ + private const DETECTION_CHARSETS = [ + 'UTF-8', + 'WINDOWS-1252', + 'ISO-8859-1', + 'ISO-8859-15', + 'ISO-8859-2', + // Add locale/userbase-specific encodings as needed. + // TODO: Make configurable and/or dynamically tailor based on user's locale/language to improve accuracy. + ]; + + private MbWrapper $mbWrapper; + + public function __construct(?MbWrapper $mbWrapper = null) { + $this->mbWrapper = $mbWrapper ?: new MbWrapper(); + } /** - * @param Horde_Mime_Part $p - * @return string - * @throws ServiceException + * Converts the contents of a MIME part to UTF-8 using charset normalization, + * detection, and fallback logic for email compatibility. + * + * @param Horde_Mime_Part $p The MIME part to convert. + * @return string The UTF-8 encoded content. + * @throws ServiceException If charset detection or conversion fails. */ public function convert(Horde_Mime_Part $p): string { /** @var null|string $data */ $data = $p->getContents(); - if ($data === null) { + if (!is_string($data) || $data === '') { return ''; } - // Only convert encoding if it is explicitly specified in the header because text/calendar - // data is utf-8 by default. $charset = $p->getCharset(); - if ($charset !== null && strtoupper($charset) === 'UTF-8') { - return $data; - } - - // The part specifies a charset - if ($charset !== null) { - if (in_array($charset, mb_list_encodings(), true)) { - $converted = mb_convert_encoding($data, 'UTF-8', $charset); - } else { - $converted = iconv($charset, 'UTF-8', $data); - } - - if (is_string($converted)) { - return $converted; + // Try header-declared charset first, if any. + // + // We always do one conversion attempt even if UTF-8 is indicated (and before detection) since: + // - headers can lie + // - some encodings may pass as "valid" UTF-8 by accident + // - we want to surface problems + if ($charset !== null && $charset !== '') { + try { + return $this->mbWrapper->convert($data, $charset, 'UTF-8'); + } catch (UnsupportedCharsetException $e) { + // fall through to detection & fallback } } - // No charset specified, let's ask mb if this could be UTF-8 - $detectedCharset = mb_detect_encoding($data, 'UTF-8', true); - if ($detectedCharset === false) { - // Fallback, non UTF-8 - $detectedCharset = mb_detect_encoding($data, null, true); - } - // Still UTF8, no need to convert - if ($detectedCharset !== false && strtoupper($detectedCharset) === 'UTF-8') { + // If already valid UTF-8, return as-is + if ($this->mbWrapper->checkEncoding($data, 'UTF-8')) { return $data; } - $converted = @mb_convert_encoding($data, 'UTF-8', $charset); - if ($converted === false) { - // Might be a charset that PHP mb doesn't know how to handle, fall back to iconv - $converted = iconv($charset, 'UTF-8', $data); + // Try prioritised detection list + $detectedCharset = mb_detect_encoding($data, self::DETECTION_CHARSETS, true); + if ($detectedCharset !== false && strtoupper($detectedCharset) !== 'UTF-8') { + try { + return $this->mbWrapper->convert($data, $detectedCharset, 'UTF-8'); + } catch (UnsupportedCharsetException $e) { + // fall through + } + } + + // Try most common Western fallback charsets manually + $fallbacks = ['WINDOWS-1252', 'ISO-8859-1']; + foreach ($fallbacks as $fallbackCharset) { + try { + return $this->mbWrapper->convert($data, $fallbackCharset, 'UTF-8'); + } catch (UnsupportedCharsetException $e) { + // continue + } } - if (!is_string($converted)) { - throw new ServiceException('Could not detect message charset'); + // If nothing succeeded, throw a rich exception for debugging + $head = is_string($data) ? $data : var_export($data, true); // better safe than sorry + $head = preg_replace('/[^\x20-\x7E\n\r\t]/', '?', $head); // binary/non-printable characters + if (mb_strlen($head) > 40) { // truncate to a sample of $data + $head = mb_substr($head, 0, 40) . '...'; } - return $converted; + throw new ServiceException(sprintf( + 'Could not detect or convert message charset (input type: %s, charset: %s, head: %s)', + gettype($data), + var_export($charset, true), + $head + )); } }