Update to use tables for case

JuliaString · Oct 17, 2018 · 3163363 · 3163363
1 parent bc78697
commit 3163363
Show file tree

Hide file tree

Showing 8 changed files with 648 additions and 147 deletions.
diff --git a/src/StrBase.jl b/src/StrBase.jl
@@ -48,6 +48,7 @@ include("types.jl")
 @static V6_COMPAT && include("compat.jl")
 @static NEW_ITERATE && include("fixparse.jl")
 include("chars.jl")
+include("charcase.jl")
 include("access.jl")
 include("traits.jl")
 include("utf8proc.jl")

diff --git a/src/casefold.jl b/src/casefold.jl
@@ -5,31 +5,16 @@ Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones
 Licensed under MIT License, see LICENSE.md
 =#
 
-_wide_lower_l(c) = ifelse(c > (V6_COMPAT ? 0xdf : 0xde), c != 0xf7, c == 0xb5)
-
-@inline _wide_lower_ch(ch) =
-    ch <= 0x7f ? _islower_a(ch) : (ch > 0xff ? _islower_u(ch) : _wide_lower_l(ch))
-
-@inline _isupper_ch(ch) =
-    ch <= 0x7f ? _isupper_a(ch) : (ch > 0xff ? _isupper_u(ch) : _isupper_l(ch))
-
-_wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff) | (!V6_COMPAT && (ch == 0xdf))
-
-_wide_out_upper(ch) =
-    ifelse(ch == 0xb5, 0x39c,
-           ifelse(ch == 0xff, 0x178, ifelse(!V6_COMPAT && ch == 0xdf, 0x1e9e, ch%UInt16)))
-
-
 function uppercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
     (len = ncodeunits(str)) == 0 && return str
     @preserve str begin
         pnt = pointer(str)
         ch = get_codeunit(pnt)
         _islower_a(ch) || return str
-        out = _allocate(len)
+        buf, out = _allocate(UInt8, len)
         unsafe_copyto!(out, pnt, len)
         set_codeunit!(out, ch - 0x20)
-        Str(C, out)
+        Str(C, buf)
     end
 end
 
@@ -39,10 +24,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
         pnt = pointer(str)
         ch = get_codeunit(pnt)
         _isupper_a(ch) || return str
-        out = _allocate(len)
+        buf, out = _allocate(UInt8, len)
         unsafe_copyto!(out, pnt, len)
         set_codeunit!(out, ch + 0x20)
-        Str(C, out)
+        Str(C, buf)
     end
 end
 
@@ -119,7 +104,7 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}}
         _can_upper(ch) || return str
         buf, out = _allocate(UInt8, len)
         set_codeunit!(out, ch - 0x20)
-        len > 1 && unsafe_copyto!(out, pnt+1, len-1)
+        len > 1 && unsafe_copyto!(out + 1, pnt+1, len-1)
         Str(C, buf)
     end
 end
@@ -154,10 +139,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
     @preserve str begin
         pnt = pointer(str)
         ch = get_codeunit(pnt)
-        _isupper(ch) || return str
+        _isupper_al(ch) || return str
         buf, out = _allocate(UInt8, len)
         set_codeunit!(out, ch + 0x20)
-        len > 1 && unsafe_copyto!(out, pnt+1, len-1)
+        len > 1 && unsafe_copyto!(out+1, pnt+1, len-1)
         Str(C, buf)
     end
 end
@@ -176,7 +161,7 @@ function _upper(::Type{C}, beg::Ptr{UInt8}, off, len) where {C<:_LatinCSE}
     out += off
     while out < fin
         ch = get_codeunit(out)
-        _can_upper(ch) && set_codeunit!(out, ch - 0x20)
+        _islower(ch) && set_codeunit!(out, ch - 0x20)
         out += 1
     end
     Str(C, buf)
@@ -264,7 +249,7 @@ end
 # result must have at least one character > 0xff, so if the only character(s)
 # > 0xff became <= 0xff, then the result may need to be narrowed and returned as _LatinStr
 
-function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
+function _lower(::Type{C}, beg, off, len) where {C<:Union{_UCS2CSE}}
     CU = codeunit(C)
     buf, out = _allocate(CU, len)
     unsafe_copyto!(out, beg, len)
@@ -277,18 +262,20 @@ function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
             _isupper_a(ch) && set_codeunit!(out, ch += 0x20)
         elseif ch <= 0xff
             _isupper_l(ch) && set_codeunit!(out, ch += 0x20)
-        elseif _isupper_u(ch)
-            ch = _lowercase_u(ch)
-            flg = ch <= 0xff
-            set_codeunit!(out, ch)
+        elseif ch <= 0xffff
+            if _can_lower_bmp(ch)
+                ch = _lower_bmp(ch)
+                flg = ch <= 0xff
+                set_codeunit!(out, ch)
+            end
         end
         out += sizeof(CU)
     end
     if flg && is_latin(buf)
         out = pointer(buf)
-        buf = _allocate(len)
-        _narrow!(pointer(buf), out, out + len)
-        Str(_LatinCSE, buf)
+        buf8 = _allocate(len)
+        _narrow!(pointer(buf8), out, out + len)
+        Str(_LatinCSE, buf8)
     else
         Str(C, buf)
     end
@@ -302,25 +289,74 @@ function _lower(::Type{C}, beg, off, len) where {C<:Union{UCS2CSE,UTF32_CSEs}}
     out += off
     while out < fin
         ch = get_codeunit(out)
-        if ch <= 0x7f
-            _isupper_a(ch) && set_codeunit!(out, ch += 0x20)
-        elseif ch <= 0xff
-            _isupper_l(ch) && set_codeunit!(out, ch += 0x20)
-        elseif _isupper_u(ch)
-            set_codeunit!(out, _lowercase_u(ch))
+        if ch <= 0xff
+            _isupper_al(ch) && set_codeunit!(out, ch += 0x20)
+        elseif ch <= 0xffff
+            _can_lower_bmp(ch) && set_codeunit!(out, _lower_bmp(ch))
+        elseif ch <= 0x1ffff
+            _can_lower_slp(ch) && set_codeunit!(out, _lower_slp(ch))
         end
         out += sizeof(CU)
     end
     Str(C, buf)
 end
 
+function lowercase_first(str::MaybeSub{S}) where {C<:_UCS2CSE,S<:Str{C}}  # lowercase only the first code unit
+    (len = ncodeunits(str)) == 0 && return str  # empty input is returned as-is
+    @preserve str begin
+        pnt = pointer(str)
+        ch = get_codeunit(pnt)
+        (ch <= 0xff ? _isupper_al(ch) : ch <= 0xffff ? _can_lower_bmp(ch) :  # can the first unit be lowercased?
+         ch <= 0x1ffff && _can_lower_slp(ch)) ||
+         return str  # no: hand back the input unchanged, no allocation
+        cl = _lower_ch(ch)
+        if ch > 0xff && cl <= 0xff && _check_mask_ul(pnt+1, len-1, _latin_mask(UInt16))  # first unit dropped to <= 0xff; NOTE(review): `pnt+1`/`len-1` here vs `out += sizeof(CU)` in the _lower loops - confirm the units the helpers expect
+            buf8, out8 = _allocate(UInt8, len)  # byte-wide output buffer
+            len > 1 && _narrow!(out8 + 1, pnt + 1, pnt + len - 1)  # remaining units are written after the first output byte
+            set_codeunit!(out8, cl)  # first output unit is the lowercased value
+            Str(_LatinCSE, buf8)  # this branch returns a _LatinCSE string rather than C
+        else
+            buf, out = _allocate(codeunit(C), len)  # same code unit type as the input
+            len > 1 && unsafe_copyto!(out, pnt, len)  # copies all units; the first is overwritten on the next line
+            set_codeunit!(out, cl)
+            Str(C, buf)
+        end
+    end
+end
+
+function uppercase_first(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}  # title-case only the first code unit
+    (len = ncodeunits(str)) == 0 && return str  # empty input is returned as-is
+    @preserve str begin
+        pnt = pointer(str)
+        ch = get_codeunit(pnt)
+        _can_title_ch(ch) || return str  # first unit has no mapping to apply: return the input unchanged
+        buf, out = _allocate(codeunit(C), len)
+        len > 1 && unsafe_copyto!(out, pnt, len)  # copies all units; the first is overwritten on the next line
+        set_codeunit!(out, _title_ch(ch))  # uses the title-case mapping, not _upper_ch
+        Str(C, buf)
+    end
+end
+
+function lowercase_first(str::MaybeSub{S}) where {C<:Union{UCS2CSE,UTF32_CSEs},S<:Str{C}}  # lowercase only the first code unit
+    (len = ncodeunits(str)) == 0 && return str  # empty input is returned as-is
+    @preserve str begin
+        pnt = pointer(str)
+        ch = get_codeunit(pnt)
+        _can_lower_ch(ch) || return str  # first unit has no lower-case mapping: return the input unchanged
+        buf, out = _allocate(codeunit(C), len)
+        len > 1 && unsafe_copyto!(out, pnt, len)  # copies all units; the first is overwritten on the next line
+        set_codeunit!(out, _lower_ch(ch))
+        Str(C, buf)  # result keeps the input's encoding type C
+    end
+end
+
 function lowercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
     @preserve str begin
         CU = codeunit(C)
         pnt = beg = pointer(str)
         fin = beg + sizeof(str)
         while pnt < fin
-            _isupper_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
+            _can_lower_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
             pnt += sizeof(CU)
         end
     end
@@ -337,16 +373,12 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}}
         ch = get_codeunit(out)
         if ch <= 0x7f
             _islower_a(ch) && set_codeunit!(out, ch -= 0x20)
-        elseif ch > 0xff
-            _islower_u(ch) && set_codeunit!(out, _uppercase_u(ch))
-        elseif _can_upper(ch)
-            set_codeunit!(out, ch -= 0x20)
-        elseif ch == 0xb5
-            set_codeunit!(out, 0x39c)
-        elseif ch == 0xff
-            set_codeunit!(out, 0x178)
-        elseif !V6_COMPAT && ch == 0xdf
-            set_codeunit!(out, 0x1e9e)
+        elseif ch <= 0xff
+            set_codeunit!(out, _uppercase_l(ch))
+        elseif ch <= 0xffff
+            _can_upper_bmp(ch) && set_codeunit!(out, _upper_bmp(ch))
+        elseif ch <= 0x1ffff
+            _can_upper_slp(ch) && set_codeunit!(out, _upper_slp(ch))
         end
         out += sizeof(CU)
     end
@@ -359,7 +391,7 @@ function uppercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:St
         pnt = beg = pointer(str)
         fin = beg + sizeof(str)
         while pnt < fin
-            _wide_lower_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
+            _can_upper_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
             pnt += sizeof(CU)
         end
         str

diff --git a/src/charcase.jl b/src/charcase.jl
@@ -0,0 +1,130 @@
+#=
+Case folding for Unicode characters
+
+Copyright 2018 Gandalf Software, Inc., Scott P. Jones
+Licensed under MIT License, see LICENSE.md
+=#
+
+module CaseTables  # keeps the table-building code and its results in their own namespace
+include("maketables.jl")
+
+const ct, tupvec, offvec, bitvec, sizvecl, sizvecu = case_tables()  # the six values returned by case_tables(), bound as constants
+end # module CaseTables
+
+using .CaseTables
+
+const ct = CaseTables.ct
+
+using ModuleInterfaceTools
+@api extend ChrBase
+
+_can_upper_lat(c) = ifelse(c > (V6_COMPAT ? 0xdf : 0xde), c != 0xf7, c == 0xb5)  # above the threshold: everything but 0xf7; at or below it: only 0xb5
+
+_wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff) | (!V6_COMPAT && (ch == 0xdf))  # the values _wide_out_upper maps above 0xff
+
+_wide_out_upper(ch) =  # 0xb5 -> 0x39c, 0xff -> 0x178, 0xdf -> 0x1e9e (unless V6_COMPAT); otherwise ch as UInt16
+    ifelse(ch == 0xb5, 0x39c,
+           ifelse(ch == 0xff, 0x178, ifelse(!V6_COMPAT && ch == 0xdf, 0x1e9e, ch%UInt16)))
+
+_check_tab(off, ch) =  # bit test: off picks a bit set in bitvec (0 -> false); the low 9 bits of ch pick the word and bit
+    off != 0 && (CaseTables.bitvec[off][((ch >>> 5) & 0xf) + 1] & (UInt32(1) << (ch & 0x1f))) != 0
+
+@inline _get_tab(off, ch) =  # two-level lookup; a 0 offset at either level returns ch unchanged
+    off == 0 ? ch : (off = CaseTables.offvec[off][((ch >>> 5) & 0x1f) + 1]) == 0 ? ch :  # bits 5-9 of ch pick the second-level offset
+    CaseTables.tupvec[off][(ch & 0x1f) + 1]  # bits 0-4 of ch pick the mapped value
+
+@inline _upper_lat(ch) = _get_tab(ct.u_tab[1], ch)  # upper-case lookup through the first u_tab entry
+
+@inline _upper_bmp(ch) =  # bit (ch >>> 9) of ct.can_u gates the lookup; u_tab is indexed by ch >>> 10
+    (t = (ch >>> 9); ((ct.can_u >>> t) & 1) == 0 ? ch : _get_tab(ct.u_tab[(t>>1)+1], ch))
+
+@inline _lower_bmp(ch) =  # bit (ch >>> 9) of ct.can_l gates the lookup; l_tab is indexed by ch >>> 10
+    (t = (ch >>> 9); ((ct.can_l >>> t) & 1) == 0 ? ch : _get_tab(ct.l_tab[(t>>1)+1], ch))
+
+@inline _title_bmp(ch) =  # gated by the same ct.can_u bit as _upper_bmp, but reads t_tab instead of u_tab
+    (t = (ch >>> 9); ((ct.can_u >>> t) & 1) == 0 ? ch : _get_tab(ct.t_tab[(t>>1)+1], ch))
+
+@inline _upper_slp(ch) =  # gate bit index is (ch >>> 9) & 0x7f in ct.s_can_u; the u_tab index uses the unmasked value
+    (t = (ch >>> 9); ((ct.s_can_u >>> (t & 0x7f)) & 1) == 0 ? ch : _get_tab(ct.u_tab[(t>>1)+1], ch))
+
+@inline _lower_slp(ch) =  # gate bit index is (ch >>> 9) & 0x7f in ct.s_can_l; the l_tab index uses the unmasked value
+    (t = (ch >>> 9); ((ct.s_can_l >>> (t & 0x7f)) & 1) != 0 ? _get_tab(ct.l_tab[(t>>1)+1], ch) : ch) === ch ? ch : _get_tab(ct.l_tab[(t>>1)+1], ch)
+
+# Range 0x0000-0xffff: true when the ct.can_l gate bit is set and the can_l_tab bit test passes
+@inline _can_lower_bmp(ch) =
+    (t = (ch >>> 9); ((ct.can_l >>> t) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch))
+
+# Range 0x10000-0x1ffff: gate bit comes from ct.s_can_l (index masked with 0x7f), then the can_l_tab bit test
+@inline _can_lower_slp(ch) =
+    (t = (ch >>> 9); ((ct.s_can_l >>> (t & 0x7f)) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch))
+
+# Range 0x0000-0xffff: true when the ct.can_u gate bit is set and the can_u_tab bit test passes
+@inline _can_upper_bmp(ch) =
+    (t = (ch >>> 9); ((ct.can_u >>> t) & 1) != 0 && _check_tab(ct.can_u_tab[t+1], ch))
+
+# Range 0x10000-0x1ffff: gate bit comes from ct.s_can_u (index masked with 0x7f), then the can_u_tab bit test
+@inline _can_upper_slp(ch) =
+    (t = (ch >>> 9); ((ct.s_can_u >>> (t & 0x7f)) & 1) != 0 && _check_tab(ct.can_u_tab[t+1], ch))
+
+#=
+# Handle range 0x0000-0xffff
+@inline _can_title_bmp(ch) =
+    (t = (ch >>> 9); ((ct.can_t >>> t) & 1) != 0 && _check_tab(ct.can_t_tab[t+1], ch))
+=#
+const _can_title_bmp = _can_upper_bmp  # alias: the dedicated can_t version above is commented out
+
+# Range 0x0000-0xffff. NOTE(review): body is identical to _can_lower_bmp (can_l tables) - confirm this is the intended "is lower" test
+@inline _is_lower_bmp(ch) =
+    (t = (ch >>> 9); ((ct.can_l >>> t) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch))
+
+# Range 0x10000-0x1ffff. NOTE(review): body is identical to _can_lower_slp (s_can_l / can_l_tab) - confirm intended
+@inline _is_lower_slp(ch) =
+    (t = (ch >>> 9); ((ct.s_can_l >>> (t & 0x7f)) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch))
+
+# Range 0x0000-0xffff. NOTE(review): body is identical to _can_upper_bmp (can_u tables) - confirm this is the intended "is upper" test
+@inline _is_upper_bmp(ch) =
+    (t = (ch >>> 9); ((ct.can_u >>> t) & 1) != 0 && _check_tab(ct.can_u_tab[t+1], ch))
+
+@inline _is_lower_ch(ch) =  # picks the lower-case predicate by value range
+    ch <= 0x7f ? _islower_a(ch) :
+    ch <= 0xff ? _islower_l(ch) :
+    ch <= 0xffff ? _is_lower_bmp(ch) :
+    ch <= 0x1ffff ? _is_lower_slp(ch) : false  # values above 0x1ffff are never reported as lower case
+
+@inline _is_upper_ch(ch) =  # picks the upper-case predicate by value range
+    ch <= 0x7f ? _isupper_a(ch) :
+    ch <= 0xff ? _isupper_l(ch) :
+    ch <= 0xffff ? _is_upper_bmp(ch) :
+    ch <= 0x1ffff ? _is_upper_slp(ch) : false  # NOTE(review): _is_upper_slp is not defined in this file - confirm it exists elsewhere
+
+@inline _can_lower_ch(ch) =  # true when _lower_ch would change ch; predicate chosen by value range
+    ch <= 0x7f ? _isupper_a(ch) :  # at or below 0xff, "can lower" is the upper-case test
+    ch <= 0xff ? _isupper_l(ch) :
+    ch <= 0xffff ? _can_lower_bmp(ch) :
+    ch <= 0x1ffff ? _can_lower_slp(ch) : false  # values above 0x1ffff are never lowercased
+
+@inline _can_upper_ch(ch) =  # selects the "can be upper-cased" predicate by value range
+    ch <= 0x7f ? _islower_a(ch) :
+    ch <= 0xff ? _can_upper_lat(ch) :
+    ch <= 0xffff ? _can_upper_bmp(ch) :
+    ch <= 0x1ffff ? _can_upper_slp(ch) : false  # values above 0x1ffff are never upper-cased
+
+const _can_title_ch = _can_upper_ch  # alias: title-case eligibility uses the upper-case check
+
+@inline _lower_ch(ch) =  # lower-case mapping of one code unit, chosen by value range; unmapped values pass through
+    ch <= 0x7f ? (_isupper_a(ch) ? ch + 0x20 : ch) :  # 0x00-0x7f: add 0x20 when _isupper_a holds
+    ch <= 0xff ? (_isupper_l(ch) ? ch + 0x20 : ch) :  # 0x80-0xff: add 0x20 when _isupper_l holds
+    ch <= 0xffff ? _lower_bmp(ch) :  # 0x100-0xffff: table lookup
+    ch <= 0x1ffff ? _lower_slp(ch) : ch  # 0x10000-0x1ffff: table lookup; anything above is returned unchanged
+
+@inline _upper_ch(ch) =  # upper-case mapping of one code unit, chosen by value range
+    ch <= 0x7f ? (_islower_a(ch) ? ch - 0x20 : ch) :  # 0x00-0x7f: subtract 0x20 when _islower_a holds
+    ch <= 0xff ? _upper_lat(ch) :
+    ch <= 0xffff ? _upper_bmp(ch) :
+    ch <= 0x1ffff ? _upper_slp(ch) : ch  # anything above 0x1ffff is returned unchanged
+
+@inline _title_ch(ch) =  # title-case mapping; differs from _upper_ch only in the 0x100-0xffff arm
+    ch <= 0x7f ? (_islower_a(ch) ? ch - 0x20 : ch) :
+    ch <= 0xff ? _upper_lat(ch) :
+    ch <= 0xffff ? _title_bmp(ch) :  # the only arm that reads the title-case table
+    ch <= 0x1ffff ? _upper_slp(ch) : ch  # anything above 0x1ffff is returned unchanged