Update iterate/next for utf8

JuliaString · Oct 23, 2018 · c8bc88b · c8bc88b
1 parent ee7a6bb
commit c8bc88b
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 51 deletions.
diff --git a/src/support.jl b/src/support.jl
@@ -910,57 +910,49 @@ end
 
 (^)(ch::CP, cnt::Integer) where {CP <: Chrs} = repeat(ch, cnt)
 
-#=
-function _repeat(::Type{CS}, ch::C, cnt::Integer) where {CS<:CSE,C<:Union{ASCIIChr,LatinChr}}
-    cnt == 0 && return empty_str(CS)
-    cnt < 0 && repeaterr(cnt)
-    buf, pnt = _allocate(UInt8, cnt)
-    cnt == 1 ? set_codeunit!(pnt, ch%UInt8) : _memset(pnt, ch%UInt8, cnt)
-    Str(CS, buf)
-end
-
-function _repeat(::Type{CS}, ch::C, cnt::Integer) where {CS<:CSE,C<:Union{UCS2Chr,UTF32Chr}}
-    cnt == 0 && return empty_str(CS)
-    cnt < 0 && repeaterr(cnt)
-    CU = codeunit(CS)
-    buf, pnt = _allocate(CU, cnt)
-    cnt == 1 ? set_codeunit!(pnt, ch%CU) : _aligned_set(pnt, ch%CU, cnt)
-    Str(CS, buf)
-end
-
-repeat(ch::ASCIIChr, cnt::Integer) = _repeat(ASCIICSE, ch, cnt)
-repeat(ch::LatinChr, cnt::Integer) = _repeat(LatinCSE, ch, cnt)
-repeat(ch::UCS2Chr,  cnt::Integer) = _repeat(UCS2CSE,  ch, cnt)
-repeat(ch::UTF32Chr, cnt::Integer) = _repeat(UTF32CSE, ch, cnt)
-=#
-
-function repeat(ch::C, cnt::Integer) where {C<:Union{ASCIIChr,LatinChr,_LatinChr}}
-    cnt == 0 && return empty_str(ASCIICSE)
-    cnt < 0 && repeaterr(cnt)
-    cu = ch%UInt8
-    buf, pnt = _allocate(UInt8, cnt)
-    _memset(pnt, cu, cnt)
-    Str((C == ASCIIChr || cu <= 0x7f) ? ASCIICSE : (C == _LatinChr ? _LatinCSE : LatinCSE), buf)
-end
-
-function repeat(ch::C, cnt::Integer) where {C<:Union{UCS2Chr,UTF32Chr}}
-    cnt == 0 && return empty_str(ASCIICSE)
-    cnt < 0 && repeaterr(cnt)
-    if ch%UInt32 <= 0xff
+function repeat(ch::C, cnt::Integer) where {C<:Union{ASCIIChr,LatinChr}}
+    if cnt > 0  # positive count: fill a UInt8 buffer with ch's code unit
+        cu = ch%UInt8
         buf, pnt = _allocate(UInt8, cnt)
-        cnt == 1 && set_codeunit!(pnt, ch%UInt8) : _memset(pnt, ch%UInt8, cnt)
-        Str(ifelse(ch%UInt8 <= 0x7f, ASCIICSE, LatinCSE), buf)
-    elseif C == UCS2Chr || ch%UInt32 <= 0xffff
+        _memset(pnt, cu, cnt)
+        C == ASCIIChr ? Str(ASCIICSE, buf) : Str(LatinCSE, buf)  # encoding follows C
+    else  # cnt <= 0: negative => repeaterr, zero => the empty string matching C
+        cnt < 0 ? repeaterr(cnt) : C == ASCIIChr ? empty_ascii : empty_latin
+    end
+end
+
+function repeat(ch::_LatinChr, cnt::Integer)
+    if cnt > 0  # positive count: fill a UInt8 buffer with ch's code unit
+        cu = ch%UInt8
+        buf, pnt = _allocate(UInt8, cnt)
+        _memset(pnt, cu, cnt)
+        cu <= 0x7f ? Str(ASCIICSE, buf) : Str(_LatinCSE, buf)  # 7-bit value => ASCIICSE
+    else
+        cnt == 0 ? empty_ascii : repeaterr(cnt)  # zero => empty_ascii, negative => repeaterr
+    end
+end
+
+function repeat(ch::UCS2Chr, cnt::Integer)
+    if cnt > 0  # one unit is stored via set_codeunit!; longer runs use _aligned_set
         buf, pnt = _allocate(UInt16, cnt)
-        cnt == 1 && set_codeunit!(pnt, ch%UInt16) : _aligned_set(pnt, ch%UInt16, cnt)
+        cnt == 1 ? set_codeunit!(pnt, ch%UInt16) : _aligned_set(pnt, ch%UInt16, cnt)
         Str(UCS2CSE, buf)
     else
+        cnt == 0 ? empty_ucs2 : repeaterr(cnt)  # zero => empty_ucs2, negative => repeaterr
+    end
+end
+
+function repeat(ch::UTF32Chr, cnt::Integer)
+    if cnt > 0  # one unit is stored via set_codeunit!; longer runs use _aligned_set
         buf, pnt = _allocate(UInt32, cnt)
-        cnt == 1 && set_codeunit!(pnt, ch%UInt32) : _aligned_set(pnt, ch%UInt32, cnt)
+        cnt == 1 ? set_codeunit!(pnt, ch%UInt32) : _aligned_set(pnt, ch%UInt32, cnt)
         Str(UTF32CSE, buf)
+    else
+        cnt == 0 ? empty_utf32 : repeaterr(cnt)  # zero => empty_utf32, negative => repeaterr
     end
 end
 
+
 # Definitions for C compatible strings, that don't allow embedded
 # '\0', and which are terminated by a '\0'
 

diff --git a/src/utf16.jl b/src/utf16.jl
@@ -186,7 +186,7 @@ end
 
 @propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF16, pos::Int) where {T}
     @boundscheck pos <= ncodeunits(str) || boundserr(str, pos)
-    _iterate(MultiCU(), T, str, pos)
+    iterate(str, pos)  # NOTE(review): T is not forwarded here; confirm the result type
 end
 
 @inline _thisind(::MultiCU, str::MS_UTF16, len, pnt, pos) =

diff --git a/src/utf8.jl b/src/utf8.jl
@@ -361,27 +361,29 @@ function _iterate_utf8(ch, str, pnt, pos)
     end
 end
 
-@propagate_inbounds function iterate(str::MS_UTF8, pos::Integer=1)
-    pos > ncodeunits(str) && return nothing
-    @boundscheck pos <= 0 && boundserr(str, pos)
+@inline function _iterate_utf8(str, pos)  # no checks on pos here; done by the callers
     @preserve str begin
         pnt = pointer(str) + pos - 1
         ch = get_codeunit(pnt)
         ch <= 0x7f ? (UTF32Chr(ch), pos + 1) : _iterate_utf8(ch, str, pnt, pos)
     end
 end
 
+@propagate_inbounds function iterate(str::MS_UTF8, pos::Integer=1)
+    pos > ncodeunits(str) && return nothing  # past the last code unit: iteration is done
+    @boundscheck pos <= 0 && boundserr(str, pos)  # non-positive positions are rejected
+    _iterate_utf8(str, pos)  # decode the character that starts at pos
+end
+
 _iterate(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T} =
     iterate(str.data, pos)
 _iterate(::MultiCU, ::Type{T}, str::SubString{<:Str{RawUTF8CSE}}, pos::Int) where {T} =
     iterate(SubString(str.string.data, str.offset + pos, str.offset + ncodeunits(str)), 1)
 
 # Gets next codepoint
-@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF8,
-                                   pos::Int) where {T<:Chr}
-    len = ncodeunits(str)
-    @boundscheck 0 < pos <= len || boundserr(str, pos)
-    _iterate(MultiCU(), T, str, pos)
+@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF8, pos::Int) where {T<:Chr}
+    @boundscheck 0 < pos <= ncodeunits(str) || boundserr(str, pos)
+    _iterate_utf8(str, pos)  # NOTE(review): T is unused here; confirm the result type
 end
 
 _next(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T} =