From a2c7c44d924ec3522bfc1040e054027aa01866bf Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 4 Oct 2025 04:32:16 +0200 Subject: [PATCH 1/2] unicodeTrim: Combine suffix and prefix matching --- Mf2/Parser.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index c7cc02d..f9a444b 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -113,8 +113,7 @@ function collapseWhitespace($str) { function unicodeTrim($str) { // this is cheating. TODO: find a better way if this causes any problems $str = str_replace(mb_convert_encoding('&nbsp;', 'UTF-8', 'HTML-ENTITIES'), ' ', $str); - $str = preg_replace('/^\s+/', '', $str); - return preg_replace('/\s+$/', '', $str); + return preg_replace('/^\s+|\s+$/', '', $str); } /** From 7e990a6701aea25263550bf74c6ff40928cecd78 Mon Sep 17 00:00:00 2001 From: Jan Tojnar Date: Sat, 4 Oct 2025 04:30:32 +0200 Subject: [PATCH 2/2] unicodeTrim: Fix PHP 8.2 deprecation mbstring extension in PHP 8.2 deprecates `HTML-ENTITIES` encoding: https://php.watch/versions/8.2/mbstring-qprint-base64-uuencode-html-entities-deprecated However, there is no need to use it to get the UTF-8 representation of NBSP; one can just directly use `\u{a0}` (or `\xc2\xa0` for PHP < 7.0). Or, even better, we can enable `PCRE_UTF8` mode: https://www.php.net/manual/en/reference.pcre.pattern.modifiers.php That will remove all Unicode whitespace characters, not just the ASCII ones and nbsp, because the `u` modifier in PHP enables `PCRE_UCP` as well as `PCRE_UTF` options: https://github.com/php/doc-en/issues/2831 It is supposed to be available since PHP 5.1: https://www.phpbb.com/community/viewtopic.php?t=733515 --- Mf2/Parser.php | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Mf2/Parser.php b/Mf2/Parser.php index f9a444b..518efbd 100644 --- a/Mf2/Parser.php +++ b/Mf2/Parser.php @@ -111,9 +111,7 @@ function collapseWhitespace($str) { } function unicodeTrim($str) { - // this is cheating. 
TODO: find a better way if this causes any problems - $str = str_replace(mb_convert_encoding('&nbsp;', 'UTF-8', 'HTML-ENTITIES'), ' ', $str); - return preg_replace('/^\s+|\s+$/', '', $str); + return preg_replace('/^\s+|\s+$/u', '', $str); } /**