Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@
"rubix/ml": "2.5.3",
"sabberworm/php-css-parser": "^8.9.0",
"wamania/php-stemmer": "4.0 as 3.0",
"youthweb/urllinker": "^2.1.0"
"youthweb/urllinker": "^2.1.0",
"zbateson/mb-wrapper": "^2.0"
},
"provide": {
"psr/log": "^1.0.4|^2|^3"
Expand Down
155 changes: 154 additions & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions lib/AppInfo/Application.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
use OCA\Mail\HordeTranslationHandler;
use OCA\Mail\Http\Middleware\ErrorMiddleware;
use OCA\Mail\Http\Middleware\ProvisioningMiddleware;
use OCA\Mail\IMAP\Charset\Converter;
use OCA\Mail\Listener\AccountSynchronizedThreadUpdaterListener;
use OCA\Mail\Listener\AddressCollectionListener;
use OCA\Mail\Listener\DeleteDraftListener;
Expand Down Expand Up @@ -85,6 +86,7 @@
use OCP\User\Events\UserDeletedEvent;
use OCP\Util;
use Psr\Container\ContainerInterface;
use ZBateson\MbWrapper\MbWrapper;

include_once __DIR__ . '/../../vendor/autoload.php';

Expand Down Expand Up @@ -116,6 +118,13 @@ public function register(IRegistrationContext $context): void {
return $favicon;
});

$context->registerService(MbWrapper::class, function (ContainerInterface $c) {
return new MbWrapper();
});
$context->registerService(Converter::class, function (ContainerInterface $c) {
return new Converter($c->get(MbWrapper::class));
});

$context->registerServiceAlias(IAvatarService::class, AvatarService::class);
$context->registerServiceAlias(IAttachmentService::class, AttachmentService::class);
$context->registerServiceAlias(IMailManager::class, MailManager::class);
Expand Down
106 changes: 70 additions & 36 deletions lib/IMAP/Charset/Converter.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,63 +11,97 @@

use Horde_Mime_Part;
use OCA\Mail\Exception\ServiceException;
use function in_array;
use ZBateson\MbWrapper\MbWrapper;
use ZBateson\MbWrapper\UnsupportedCharsetException;
use function is_string;

class Converter {
	/**
	 * Prioritized charsets used for detection if header is missing/wrong.
	 * This list can be expanded/tweaked based on userbase/email sources/field experience.
	 */
	private const DETECTION_CHARSETS = [
		'UTF-8',
		'WINDOWS-1252',
		'ISO-8859-1',
		'ISO-8859-15',
		'ISO-8859-2',
		// Add locale/userbase-specific encodings as needed.
		// TODO: Make configurable and/or dynamically tailor based on user's locale/language to improve accuracy.
	];

	/**
	 * Charsets tried, in order, when neither the declared charset nor
	 * detection produced a usable conversion.
	 */
	private const FALLBACK_CHARSETS = [
		'WINDOWS-1252',
		'ISO-8859-1',
	];

	/** Maximum number of characters of the payload quoted in error messages. */
	private const ERROR_SAMPLE_LENGTH = 40;

	private MbWrapper $mbWrapper;

	/**
	 * @param MbWrapper|null $mbWrapper Charset conversion backend; a default
	 *                                  instance is created when omitted.
	 */
	public function __construct(?MbWrapper $mbWrapper = null) {
		$this->mbWrapper = $mbWrapper ?? new MbWrapper();
	}

	/**
	 * Converts the contents of a MIME part to UTF-8 using the declared charset,
	 * detection, and fallback logic for email compatibility.
	 *
	 * Order of attempts:
	 *  1. the charset declared on the part (if any),
	 *  2. pass-through when the data already checks out as UTF-8,
	 *  3. detection against DETECTION_CHARSETS,
	 *  4. the FALLBACK_CHARSETS list.
	 *
	 * @param Horde_Mime_Part $p The MIME part to convert.
	 * @return string The UTF-8 encoded content, or '' when the part has no string contents.
	 * @throws ServiceException If none of the attempts above yields a conversion.
	 */
	public function convert(Horde_Mime_Part $p): string {
		/** @var null|string $data */
		$data = $p->getContents();
		if (!is_string($data) || $data === '') {
			return '';
		}

		$charset = $p->getCharset();

		// Try header-declared charset first, if any.
		//
		// We always do one conversion attempt even if UTF-8 is indicated (and before detection) since:
		// - headers can lie
		// - some encodings may pass as "valid" UTF-8 by accident
		// - we want to surface problems
		if ($charset !== null && $charset !== '') {
			try {
				return $this->mbWrapper->convert($data, $charset, 'UTF-8');
			} catch (UnsupportedCharsetException $e) {
				// Declared charset is unknown to the backend: fall through to detection & fallback.
			}
		}

		// If already valid UTF-8, return as-is
		if ($this->mbWrapper->checkEncoding($data, 'UTF-8')) {
			return $data;
		}

		// Try prioritised detection list
		$detectedCharset = mb_detect_encoding($data, self::DETECTION_CHARSETS, true);
		if ($detectedCharset !== false && strtoupper($detectedCharset) !== 'UTF-8') {
			try {
				return $this->mbWrapper->convert($data, $detectedCharset, 'UTF-8');
			} catch (UnsupportedCharsetException $e) {
				// Detected charset is not convertible: fall through to the fixed fallbacks.
			}
		}

		// Try most common Western fallback charsets manually
		foreach (self::FALLBACK_CHARSETS as $fallbackCharset) {
			try {
				return $this->mbWrapper->convert($data, $fallbackCharset, 'UTF-8');
			} catch (UnsupportedCharsetException $e) {
				// Try the next fallback.
			}
		}

		// Nothing succeeded: throw a rich exception for debugging.
		// $data is known to be a non-empty string here, so no type guard is needed.
		// Mask binary/non-printable bytes; preg_replace() yields null on a PCRE error,
		// in which case we simply report an empty sample.
		$head = preg_replace('/[^\x20-\x7E\n\r\t]/', '?', $data) ?? '';
		if (mb_strlen($head) > self::ERROR_SAMPLE_LENGTH) {
			// Truncate to a short sample of $data
			$head = mb_substr($head, 0, self::ERROR_SAMPLE_LENGTH) . '...';
		}

		throw new ServiceException(sprintf(
			'Could not detect or convert message charset (input type: %s, charset: %s, head: %s)',
			gettype($data),
			var_export($charset, true),
			$head
		));
	}
}
Loading