Skip to content

Commit

Permalink
Update iterate/next for utf8
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Oct 23, 2018
1 parent ee7a6bb commit c8bc88b
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 51 deletions.
76 changes: 34 additions & 42 deletions src/support.jl
Original file line number Diff line number Diff line change
Expand Up @@ -910,57 +910,49 @@ end

# `ch ^ cnt` is operator shorthand for `repeat(ch, cnt)` on any `Chrs` character type.
function (^)(ch::CP, cnt::Integer) where {CP <: Chrs}
    return repeat(ch, cnt)
end

#=
function _repeat(::Type{CS}, ch::C, cnt::Integer) where {CS<:CSE,C<:Union{ASCIIChr,LatinChr}}
cnt == 0 && return empty_str(CS)
cnt < 0 && repeaterr(cnt)
buf, pnt = _allocate(UInt8, cnt)
cnt == 1 ? set_codeunit!(pnt, ch%UInt8) : _memset(pnt, ch%UInt8, cnt)
Str(CS, buf)
end
function _repeat(::Type{CS}, ch::C, cnt::Integer) where {CS<:CSE,C<:Union{UCS2Chr,UTF32Chr}}
cnt == 0 && return empty_str(CS)
cnt < 0 && repeaterr(cnt)
CU = codeunit(CS)
buf, pnt = _allocate(CU, cnt)
cnt == 1 ? set_codeunit!(pnt, ch%CU) : _aligned_set(pnt, ch%CU, cnt)
Str(CS, buf)
end
repeat(ch::ASCIIChr, cnt::Integer) = _repeat(ASCIICSE, ch, cnt)
repeat(ch::LatinChr, cnt::Integer) = _repeat(LatinCSE, ch, cnt)
repeat(ch::UCS2Chr, cnt::Integer) = _repeat(UCS2CSE, ch, cnt)
repeat(ch::UTF32Chr, cnt::Integer) = _repeat(UTF32CSE, ch, cnt)
=#

# Repeat a single-byte character (`ASCIIChr` or `LatinChr`) `cnt` times.
#
# - `cnt > 0`:  allocates a `cnt`-byte buffer, fills it with the character's
#               code unit, and wraps it as an `ASCIICSE` string for `ASCIIChr`
#               or a `LatinCSE` string for `LatinChr`.
# - `cnt == 0`: returns the shared empty string matching the character type.
# - `cnt < 0`:  delegates to `repeaterr(cnt)` (presumably throws - confirm).
function repeat(ch::C, cnt::Integer) where {C<:Union{ASCIIChr,LatinChr}}
    if cnt > 0
        cu = ch%UInt8
        buf, pnt = _allocate(UInt8, cnt)
        _memset(pnt, cu, cnt)
        C == ASCIIChr ? Str(ASCIICSE, buf) : Str(LatinCSE, buf)
    else
        # `C` is a character type, so it has to be compared against `ASCIIChr`;
        # comparing against the string type `ASCIIStr` can never match and would
        # always select the Latin empty string.
        cnt < 0 ? repeaterr(cnt) : C == ASCIIChr ? empty_ascii : empty_latin
    end
end

# Repeat a `_LatinChr` `cnt` times.
#
# A zero count yields `empty_ascii`; a negative count is handed to `repeaterr`.
# Otherwise a `cnt`-byte buffer is filled with the character's byte and tagged
# `ASCIICSE` when that byte is <= 0x7f, or `_LatinCSE` when it is not.
function repeat(ch::_LatinChr, cnt::Integer)
    cnt > 0 || return (cnt == 0 ? empty_ascii : repeaterr(cnt))
    byte = ch%UInt8
    buf, pnt = _allocate(UInt8, cnt)
    _memset(pnt, byte, cnt)
    return byte <= 0x7f ? Str(ASCIICSE, buf) : Str(_LatinCSE, buf)
end

# Repeat a `UCS2Chr` `cnt` times, producing a `UCS2CSE` string.
#
# - `cnt > 0`:  allocates `cnt` 16-bit code units; a single repetition stores
#               the unit directly, larger counts use the bulk `_aligned_set` fill.
# - `cnt == 0`: returns `empty_ucs2`.
# - `cnt < 0`:  delegates to `repeaterr(cnt)` (presumably throws - confirm).
function repeat(ch::UCS2Chr, cnt::Integer)
    if cnt > 0
        buf, pnt = _allocate(UInt16, cnt)
        # Must be a full ternary: `a && b : c` is not valid Julia.
        cnt == 1 ? set_codeunit!(pnt, ch%UInt16) : _aligned_set(pnt, ch%UInt16, cnt)
        Str(UCS2CSE, buf)
    else
        cnt == 0 ? empty_ucs2 : repeaterr(cnt)
    end
end

# Repeat a `UTF32Chr` `cnt` times, producing a `UTF32CSE` string.
#
# - `cnt > 0`:  allocates `cnt` 32-bit code units; a single repetition stores
#               the unit directly, larger counts use the bulk `_aligned_set` fill.
# - `cnt == 0`: returns `empty_utf32`.
# - `cnt < 0`:  delegates to `repeaterr(cnt)` (presumably throws - confirm).
function repeat(ch::UTF32Chr, cnt::Integer)
    if cnt > 0
        buf, pnt = _allocate(UInt32, cnt)
        # Must be a full ternary: `a && b : c` is not valid Julia.
        cnt == 1 ? set_codeunit!(pnt, ch%UInt32) : _aligned_set(pnt, ch%UInt32, cnt)
        Str(UTF32CSE, buf)
    else
        cnt == 0 ? empty_utf32 : repeaterr(cnt)
    end
end


# Definitions for C-compatible strings, which do not allow embedded
# '\0' characters and are terminated by a '\0'

Expand Down
2 changes: 1 addition & 1 deletion src/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ end

# Return the result of `iterate(str, pos)` for a UTF-16 string after checking
# that `pos` does not run past the last code unit.
#
# The bounds check is elided when the caller is under `@inbounds`. Only the
# upper bound is checked here. `T` is accepted for signature compatibility with
# the other `_next` methods but is not used.
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF16, pos::Int) where {T}
    @boundscheck pos <= ncodeunits(str) || boundserr(str, pos)
    # Decode exactly once; an extra `_iterate` call here would only have its
    # result thrown away.
    iterate(str, pos)
end

@inline _thisind(::MultiCU, str::MS_UTF16, len, pnt, pos) =
Expand Down
18 changes: 10 additions & 8 deletions src/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -361,27 +361,29 @@ function _iterate_utf8(ch, str, pnt, pos)
end
end

# Decode the code point that starts at code unit `pos` of `str`, with no bounds
# checking. Returns `(UTF32Chr, next_pos)` for a single-byte lead unit, and
# otherwise whatever the four-argument `_iterate_utf8` returns.
@inline function _iterate_utf8(str, pos)
    # Keep `str` alive while its raw pointer is being read.
    @preserve str begin
        pnt = pointer(str) + pos - 1
        ch = get_codeunit(pnt)
        # Fast path: a lead unit <= 0x7f is a complete one-unit character;
        # anything else goes to the multi-unit decoder.
        ch <= 0x7f ? (UTF32Chr(ch), pos + 1) : _iterate_utf8(ch, str, pnt, pos)
    end
end

# Iteration protocol for UTF-8 strings: returns `nothing` once `pos` is past the
# last code unit, otherwise the decoded character and the following position.
# A non-positive `pos` is reported through `boundserr` unless bounds checks are
# elided by `@inbounds` in the caller.
@propagate_inbounds function iterate(str::MS_UTF8, pos::Integer=1)
    pos > ncodeunits(str) && return nothing
    @boundscheck pos <= 0 && boundserr(str, pos)
    _iterate_utf8(str, pos)
end

# Raw UTF-8 strings iterate by forwarding to `iterate` on the wrapped `data`
# field. `T` is not used.
function _iterate(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T}
    return iterate(str.data, pos)
end
# Substrings of raw UTF-8 strings: build a `SubString` over the parent's `data`
# covering `pos` through the end of `str`, and iterate it from its first index.
# `T` is not used.
# NOTE(review): the index returned by `iterate` is relative to the temporary
# `SubString`, not to `str` - confirm that callers expect this.
function _iterate(::MultiCU, ::Type{T}, str::SubString{<:Str{RawUTF8CSE}},
                  pos::Int) where {T}
    first_unit = str.offset + pos
    last_unit = str.offset + ncodeunits(str)
    return iterate(SubString(str.string.data, first_unit, last_unit), 1)
end

# Gets next codepoint
#
# Checks `0 < pos <= ncodeunits(str)` (reported through `boundserr`; elided when
# the caller is under `@inbounds`) and then decodes the character starting at
# `pos` with `_iterate_utf8`, returning its result unchanged. `T` only
# constrains dispatch and is not otherwise used.
@propagate_inbounds function _next(::MultiCU, ::Type{T}, str::MS_UTF8,
                                   pos::Int) where {T<:Chr}
    @boundscheck 0 < pos <= ncodeunits(str) || boundserr(str, pos)
    _iterate_utf8(str, pos)
end

_next(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T} =
Expand Down

0 comments on commit c8bc88b

Please sign in to comment.