Skip to content

Commit

Permalink
Update to use tables for case
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Oct 17, 2018
1 parent bc78697 commit 3163363
Show file tree
Hide file tree
Showing 8 changed files with 648 additions and 147 deletions.
1 change: 1 addition & 0 deletions src/StrBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ include("types.jl")
@static V6_COMPAT && include("compat.jl")
@static NEW_ITERATE && include("fixparse.jl")
include("chars.jl")
include("charcase.jl")
include("access.jl")
include("traits.jl")
include("utf8proc.jl")
Expand Down
130 changes: 81 additions & 49 deletions src/casefold.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,16 @@ Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones
Licensed under MIT License, see LICENSE.md
=#

# Predicate on a code unit `c` (callers in this file pass values in 0x80-0xff).
# Above the cutoff (0xdf when V6_COMPAT is true, otherwise 0xde) every value except 0xf7
# answers true; at or below the cutoff only 0xb5 answers true.
function _wide_lower_l(c)
    cutoff = V6_COMPAT ? 0xdf : 0xde
    return ifelse(c > cutoff, c != 0xf7, c == 0xb5)
end

# Dispatch a code unit to the matching range-specific predicate:
# 0x00-0x7f -> _islower_a, 0x80-0xff -> _wide_lower_l, above 0xff -> _islower_u.
@inline function _wide_lower_ch(ch)
    ch <= 0x7f && return _islower_a(ch)
    ch > 0xff && return _islower_u(ch)
    return _wide_lower_l(ch)
end

# Dispatch a code unit to the matching range-specific upper-case predicate:
# 0x00-0x7f -> _isupper_a, 0x80-0xff -> _isupper_l, above 0xff -> _isupper_u.
@inline function _isupper_ch(ch)
    ch <= 0x7f && return _isupper_a(ch)
    ch > 0xff && return _isupper_u(ch)
    return _isupper_l(ch)
end

# True for the code units that _wide_out_upper maps to a value above 0xff:
# 0xb5 and 0xff always, and 0xdf only when V6_COMPAT is false.
function _wide_lower_latin(ch)
    always_wide = (ch == 0xb5) | (ch == 0xff)
    sharp_s_wide = !V6_COMPAT && (ch == 0xdf)
    return always_wide | sharp_s_wide
end

# Map the special code units to their replacement values:
# 0xb5 -> 0x39c, 0xff -> 0x178, and (only when V6_COMPAT is false) 0xdf -> 0x1e9e.
# Any other value is returned unchanged, converted with `% UInt16`.
function _wide_out_upper(ch)
    is_sharp_s = !V6_COMPAT && ch == 0xdf
    fallback = ifelse(is_sharp_s, 0x1e9e, ch % UInt16)
    after_ff = ifelse(ch == 0xff, 0x178, fallback)
    return ifelse(ch == 0xb5, 0x39c, after_ff)
end


# Upper-case the first code unit of an ASCII-encoded Str.
#
# Returns `str` itself when it is empty or when its first code unit does not satisfy
# _islower_a. Otherwise allocates a new buffer of `len` bytes, copies the whole input
# into it, subtracts 0x20 from the first code unit and wraps the buffer as `Str(C, buf)`.
#
# Fix: the previous body still carried two leftover statements from the older
# single-value `_allocate(len)` API: an allocation whose result was overwritten on the
# next line, and a `Str(C, out)` built from the raw output pointer and thrown away.
# Only the `buf, out = _allocate(UInt8, len)` path is kept.
function uppercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
    (len = ncodeunits(str)) == 0 && return str
    @preserve str begin
        pnt = pointer(str)
        ch = get_codeunit(pnt)
        _islower_a(ch) || return str
        # buf owns the storage; out is the pointer used to fill it
        buf, out = _allocate(UInt8, len)
        unsafe_copyto!(out, pnt, len)
        # overwrite the first code unit with its upper-case form
        set_codeunit!(out, ch - 0x20)
        Str(C, buf)
    end
end

Expand All @@ -39,10 +24,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
pnt = pointer(str)
ch = get_codeunit(pnt)
_isupper_a(ch) || return str
out = _allocate(len)
buf, out = _allocate(UInt8, len)
unsafe_copyto!(out, pnt, len)
set_codeunit!(out, ch + 0x20)
Str(C, out)
Str(C, buf)
end
end

Expand Down Expand Up @@ -119,7 +104,7 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}}
_can_upper(ch) || return str
buf, out = _allocate(UInt8, len)
set_codeunit!(out, ch - 0x20)
len > 1 && unsafe_copyto!(out, pnt+1, len-1)
len > 1 && unsafe_copyto!(out + 1, pnt+1, len-1)
Str(C, buf)
end
end
Expand Down Expand Up @@ -154,10 +139,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
@preserve str begin
pnt = pointer(str)
ch = get_codeunit(pnt)
_isupper(ch) || return str
_isupper_al(ch) || return str
buf, out = _allocate(UInt8, len)
set_codeunit!(out, ch + 0x20)
len > 1 && unsafe_copyto!(out, pnt+1, len-1)
len > 1 && unsafe_copyto!(out+1, pnt+1, len-1)
Str(C, buf)
end
end
Expand All @@ -176,7 +161,7 @@ function _upper(::Type{C}, beg::Ptr{UInt8}, off, len) where {C<:_LatinCSE}
out += off
while out < fin
ch = get_codeunit(out)
_can_upper(ch) && set_codeunit!(out, ch - 0x20)
_islower(ch) && set_codeunit!(out, ch - 0x20)
out += 1
end
Str(C, buf)
Expand Down Expand Up @@ -264,7 +249,7 @@ end
# result must have at least one character > 0xff, so if the only character(s)
# > 0xff became <= 0xff, then the result may need to be narrowed and returned as _LatinStr

function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
function _lower(::Type{C}, beg, off, len) where {C<:Union{_UCS2CSE}}
CU = codeunit(C)
buf, out = _allocate(CU, len)
unsafe_copyto!(out, beg, len)
Expand All @@ -277,18 +262,20 @@ function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
_isupper_a(ch) && set_codeunit!(out, ch += 0x20)
elseif ch <= 0xff
_isupper_l(ch) && set_codeunit!(out, ch += 0x20)
elseif _isupper_u(ch)
ch = _lowercase_u(ch)
flg = ch <= 0xff
set_codeunit!(out, ch)
elseif ch <= 0xffff
if _can_lower_bmp(ch)
ch = _lower_bmp(ch)
flg = ch <= 0xff
set_codeunit!(out, ch)
end
end
out += sizeof(CU)
end
if flg && is_latin(buf)
out = pointer(buf)
buf = _allocate(len)
_narrow!(pointer(buf), out, out + len)
Str(_LatinCSE, buf)
buf8 = _allocate(len)
_narrow!(pointer(buf8), out, out + len)
Str(_LatinCSE, buf8)
else
Str(C, buf)
end
Expand All @@ -302,25 +289,74 @@ function _lower(::Type{C}, beg, off, len) where {C<:Union{UCS2CSE,UTF32_CSEs}}
out += off
while out < fin
ch = get_codeunit(out)
if ch <= 0x7f
_isupper_a(ch) && set_codeunit!(out, ch += 0x20)
elseif ch <= 0xff
_isupper_l(ch) && set_codeunit!(out, ch += 0x20)
elseif _isupper_u(ch)
set_codeunit!(out, _lowercase_u(ch))
if ch <= 0xff
_isupper_al(ch) && set_codeunit!(out, ch += 0x20)
elseif ch <= 0xffff
_can_lower_bmp(ch) && set_codeunit!(out, _lower_bmp(ch))
elseif ch <= 0x1ffff
_can_lower_slp(ch) && set_codeunit!(out, _lower_slp(ch))
end
out += sizeof(CU)
end
Str(C, buf)
end

# Lower-case the first code unit of a _UCS2CSE string.
#
# Returns `str` unchanged when it is empty or when the first code unit cannot be
# lower-cased. If the first code unit was above 0xff, its lower-case form fits in
# 0x00-0xff and the mask check on the remainder succeeds, the result is narrowed into a
# UInt8 buffer and returned as `Str(_LatinCSE, ...)`; otherwise a buffer of the original
# code-unit type is returned as `Str(C, ...)`.
#
# NOTE(review): `src + 1` moves a Julia Ptr by one byte, while other loops in this file
# step UCS-2 pointers by sizeof(CU). Confirm the units expected by _check_mask_ul and
# _narrow! (and the `src + len - 1` end bound); the expressions are kept exactly as written.
function lowercase_first(str::MaybeSub{S}) where {C<:_UCS2CSE,S<:Str{C}}
    len = ncodeunits(str)
    len == 0 && return str
    @preserve str begin
        src = pointer(str)
        cu = get_codeunit(src)
        changeable =
            if cu <= 0xff
                _isupper_al(cu)
            elseif cu <= 0xffff
                _can_lower_bmp(cu)
            else
                cu <= 0x1ffff && _can_lower_slp(cu)
            end
        changeable || return str
        lowered = _lower_ch(cu)
        if cu > 0xff && lowered <= 0xff &&
           _check_mask_ul(src+1, len-1, _latin_mask(UInt16))
            # every code unit of the result fits in one byte: emit the narrow encoding
            narrow_buf, narrow_out = _allocate(UInt8, len)
            len > 1 && _narrow!(narrow_out + 1, src + 1, src + len - 1)
            set_codeunit!(narrow_out, lowered)
            Str(_LatinCSE, narrow_buf)
        else
            # copy everything, then overwrite the first code unit
            wide_buf, wide_out = _allocate(codeunit(C), len)
            len > 1 && unsafe_copyto!(wide_out, src, len)
            set_codeunit!(wide_out, lowered)
            Str(C, wide_buf)
        end
    end
end

# Title-case the first code unit of a UCS-2 / UTF-32 encoded Str.
#
# Returns `str` itself when it is empty or when _can_title_ch rejects the first code
# unit. Otherwise a new buffer of `len` code units is allocated, the input is copied
# into it (only when len > 1) and the first code unit is replaced by _title_ch of the
# original one.
function uppercase_first(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
    len = ncodeunits(str)
    len == 0 && return str
    @preserve str begin
        src = pointer(str)
        cu = get_codeunit(src)
        if _can_title_ch(cu)
            buf, dst = _allocate(codeunit(C), len)
            len > 1 && unsafe_copyto!(dst, src, len)
            set_codeunit!(dst, _title_ch(cu))
            Str(C, buf)
        else
            str
        end
    end
end

# Lower-case the first code unit of a UCS2CSE / UTF-32 encoded Str.
#
# Returns `str` itself when it is empty or when _can_lower_ch rejects the first code
# unit. Otherwise a new buffer of `len` code units is allocated, the input is copied
# into it (only when len > 1) and the first code unit is replaced by _lower_ch of the
# original one.
function lowercase_first(str::MaybeSub{S}) where {C<:Union{UCS2CSE,UTF32_CSEs},S<:Str{C}}
    len = ncodeunits(str)
    len == 0 && return str
    @preserve str begin
        src = pointer(str)
        cu = get_codeunit(src)
        if _can_lower_ch(cu)
            buf, dst = _allocate(codeunit(C), len)
            len > 1 && unsafe_copyto!(dst, src, len)
            set_codeunit!(dst, _lower_ch(cu))
            Str(C, buf)
        else
            str
        end
    end
end

function lowercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
@preserve str begin
CU = codeunit(C)
pnt = beg = pointer(str)
fin = beg + sizeof(str)
while pnt < fin
_isupper_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
_can_lower_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
pnt += sizeof(CU)
end
end
Expand All @@ -337,16 +373,12 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}}
ch = get_codeunit(out)
if ch <= 0x7f
_islower_a(ch) && set_codeunit!(out, ch -= 0x20)
elseif ch > 0xff
_islower_u(ch) && set_codeunit!(out, _uppercase_u(ch))
elseif _can_upper(ch)
set_codeunit!(out, ch -= 0x20)
elseif ch == 0xb5
set_codeunit!(out, 0x39c)
elseif ch == 0xff
set_codeunit!(out, 0x178)
elseif !V6_COMPAT && ch == 0xdf
set_codeunit!(out, 0x1e9e)
elseif ch <= 0xff
set_codeunit!(out, _uppercase_l(ch))
elseif ch <= 0xffff
_can_upper_bmp(ch) && set_codeunit!(out, _upper_bmp(ch))
elseif ch <= 0x1ffff
_can_upper_slp(ch) && set_codeunit!(out, _upper_slp(ch))
end
out += sizeof(CU)
end
Expand All @@ -359,7 +391,7 @@ function uppercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:St
pnt = beg = pointer(str)
fin = beg + sizeof(str)
while pnt < fin
_wide_lower_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
_can_upper_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
pnt += sizeof(CU)
end
str
Expand Down
130 changes: 130 additions & 0 deletions src/charcase.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#=
Case folding for Unicode characters
Copyright 2018 Gandalf Software, Inc., Scott P. Jones
Licensed under MIT License, see LICENSE.md
=#

# Sub-module that builds the case lookup tables once, when the module is loaded.
module CaseTables
include("maketables.jl")

# case_tables() is presumably defined in maketables.jl (not visible here - confirm).
# Its six results are bound as constants:
#   ct      - descriptor whose fields (can_u, can_l, u_tab, l_tab, t_tab, ...) are read
#             by the helpers of the enclosing file
#   tupvec  - indexed by _get_tab to fetch the mapped value
#   offvec  - indexed by _get_tab to pick an entry of tupvec
#   bitvec  - indexed by _check_tab to test a per-character bit
#   sizvecl, sizvecu - not referenced in the visible part of the enclosing file
const ct, tupvec, offvec, bitvec, sizvecl, sizvecu = case_tables()
end # module CaseTables

using .CaseTables

# Local shorthand for the table descriptor built in CaseTables. The helpers below read
# its bit-flag fields (can_u, can_l, s_can_u, s_can_l) and its offset tables
# (u_tab, l_tab, t_tab, can_u_tab, can_l_tab).
const ct = CaseTables.ct

using ModuleInterfaceTools
@api extend ChrBase

# Upper-casing predicate used by _can_upper_ch for code units in 0x80-0xff.
# Above the cutoff (0xdf when V6_COMPAT is true, otherwise 0xde) every value except 0xf7
# answers true; at or below the cutoff only 0xb5 answers true.
function _can_upper_lat(c)
    cutoff = V6_COMPAT ? 0xdf : 0xde
    return ifelse(c > cutoff, c != 0xf7, c == 0xb5)
end

# True for the code units that _wide_out_upper maps to a value above 0xff:
# 0xb5 and 0xff always, and 0xdf only when V6_COMPAT is false.
function _wide_lower_latin(ch)
    always_wide = (ch == 0xb5) | (ch == 0xff)
    sharp_s_wide = !V6_COMPAT && (ch == 0xdf)
    return always_wide | sharp_s_wide
end

# Map the special code units to their replacement values:
# 0xb5 -> 0x39c, 0xff -> 0x178, and (only when V6_COMPAT is false) 0xdf -> 0x1e9e.
# Any other value is returned unchanged, converted with `% UInt16`.
function _wide_out_upper(ch)
    is_sharp_s = !V6_COMPAT && ch == 0xdf
    fallback = ifelse(is_sharp_s, 0x1e9e, ch % UInt16)
    after_ff = ifelse(ch == 0xff, 0x178, fallback)
    return ifelse(ch == 0xb5, 0x39c, after_ff)
end

# Test the membership bit for `ch` in bit table number `off`.
# `off == 0` means "no table" and yields false. Otherwise bits 5-8 of `ch` select one
# of 16 words in CaseTables.bitvec[off], and the low 5 bits select the bit in that word.
function _check_tab(off, ch)
    off == 0 && return false
    word = CaseTables.bitvec[off][((ch >>> 5) & 0xf) + 1]
    bit = UInt32(1) << (ch & 0x1f)
    return (word & bit) != 0
end

# Two-level lookup of the mapped value for `ch`.
# `off == 0` means "no table" and returns `ch` unchanged. Otherwise bits 5-9 of `ch`
# select an entry of CaseTables.offvec[off]; a zero entry again returns `ch`, and a
# non-zero entry picks the CaseTables.tupvec row indexed by the low 5 bits of `ch`.
@inline function _get_tab(off, ch)
    off == 0 && return ch
    row = CaseTables.offvec[off][((ch >>> 5) & 0x1f) + 1]
    row == 0 && return ch
    return CaseTables.tupvec[row][(ch & 0x1f) + 1]
end

# Upper-case mapping taken from the first upper-case table (ct.u_tab[1]); used by
# _upper_ch and _title_ch for code units in 0x80-0xff.
@inline function _upper_lat(ch)
    first_table = ct.u_tab[1]
    return _get_tab(first_table, ch)
end

# Upper-case mapping for the range 0x0000-0xffff.
# Bit (ch >>> 9) of ct.can_u says whether the enclosing 512-value block has any
# mapping; when it is clear `ch` is returned as is, otherwise the table selected by
# ct.u_tab[(ch >>> 10) + 1] is consulted.
@inline function _upper_bmp(ch)
    t = ch >>> 9
    ((ct.can_u >>> t) & 1) == 0 && return ch
    return _get_tab(ct.u_tab[(t >> 1) + 1], ch)
end

# Lower-case mapping for the range 0x0000-0xffff.
# Bit (ch >>> 9) of ct.can_l says whether the enclosing 512-value block has any
# mapping; when it is clear `ch` is returned as is, otherwise the table selected by
# ct.l_tab[(ch >>> 10) + 1] is consulted.
@inline function _lower_bmp(ch)
    t = ch >>> 9
    ((ct.can_l >>> t) & 1) == 0 && return ch
    return _get_tab(ct.l_tab[(t >> 1) + 1], ch)
end

# Title-case mapping for the range 0x0000-0xffff.
# Gated by the same ct.can_u block flags as _upper_bmp, but the value comes from the
# title tables (ct.t_tab).
@inline function _title_bmp(ch)
    t = ch >>> 9
    ((ct.can_u >>> t) & 1) == 0 && return ch
    return _get_tab(ct.t_tab[(t >> 1) + 1], ch)
end

# Upper-case mapping for the range 0x10000-0x1ffff.
# The block flag is read from ct.s_can_u using the low 7 bits of (ch >>> 9); the table
# offset is read from ct.u_tab with the unmasked value.
# NOTE(review): flag index is masked but table index is not - confirm ct.u_tab is
# laid out to cover indices above the BMP entries.
@inline function _upper_slp(ch)
    t = ch >>> 9
    ((ct.s_can_u >>> (t & 0x7f)) & 1) == 0 && return ch
    return _get_tab(ct.u_tab[(t >> 1) + 1], ch)
end

# Lower-case mapping for the range 0x10000-0x1ffff.
# The block flag is read from ct.s_can_l using the low 7 bits of (ch >>> 9); the table
# offset is read from ct.l_tab with the unmasked value.
# NOTE(review): flag index is masked but table index is not - confirm ct.l_tab is
# laid out to cover indices above the BMP entries.
@inline function _lower_slp(ch)
    t = ch >>> 9
    ((ct.s_can_l >>> (t & 0x7f)) & 1) == 0 && return ch
    return _get_tab(ct.l_tab[(t >> 1) + 1], ch)
end

# Range 0x0000-0xffff: does `ch` have its bit set in the "can lower" tables?
# The block flag in ct.can_l is checked first; only when it is set is the per-character
# bit table (ct.can_l_tab) consulted.
@inline function _can_lower_bmp(ch)
    t = ch >>> 9
    ((ct.can_l >>> t) & 1) == 0 && return false
    return _check_tab(ct.can_l_tab[t + 1], ch)
end

# Range 0x10000-0x1ffff: does `ch` have its bit set in the "can lower" tables?
# The block flag is bit ((ch >>> 9) & 0x7f) of ct.s_can_l; the per-character bit table
# is looked up in ct.can_l_tab with the unmasked block number.
@inline function _can_lower_slp(ch)
    t = ch >>> 9
    ((ct.s_can_l >>> (t & 0x7f)) & 1) == 0 && return false
    return _check_tab(ct.can_l_tab[t + 1], ch)
end

# Range 0x0000-0xffff: does `ch` have its bit set in the "can upper" tables?
# The block flag in ct.can_u is checked first; only when it is set is the per-character
# bit table (ct.can_u_tab) consulted.
@inline function _can_upper_bmp(ch)
    t = ch >>> 9
    ((ct.can_u >>> t) & 1) == 0 && return false
    return _check_tab(ct.can_u_tab[t + 1], ch)
end

# Range 0x10000-0x1ffff: does `ch` have its bit set in the "can upper" tables?
# The block flag is bit ((ch >>> 9) & 0x7f) of ct.s_can_u; the per-character bit table
# is looked up in ct.can_u_tab with the unmasked block number.
@inline function _can_upper_slp(ch)
    t = ch >>> 9
    ((ct.s_can_u >>> (t & 0x7f)) & 1) == 0 && return false
    return _check_tab(ct.can_u_tab[t + 1], ch)
end

# Title-case eligibility for 0x0000-0xffff is currently an alias of the upper-case
# predicate. The block comment below keeps a disabled variant that would read dedicated
# title tables (ct.can_t / ct.can_t_tab) instead.
#=
# Handle range 0x0000-0xffff
@inline _can_title_bmp(ch) =
(t = (ch >>> 9); ((ct.can_t >>> t) & 1) != 0 && _check_tab(ct.can_t_tab[t+1], ch))
=#
const _can_title_bmp = _can_upper_bmp

# Table-driven "is lower-case" / "is upper-case" predicates for code points above 0xff.
#
# Fix 1: these predicates previously read the same tables as the identically named
# `_can_lower_*` / `_can_upper_*` helpers, so `_is_lower_*` answered "can be
# lower-cased". That is the opposite of the ASCII/Latin branches of _is_lower_ch /
# _is_upper_ch (compare _can_lower_ch, whose ASCII branch is _isupper_a). A character is
# now reported as lower-case when it can be upper-cased, and as upper-case when it can
# be lower-cased.
# Fix 2: `_is_upper_slp`, which _is_upper_ch calls for 0x10000-0x1ffff, was not defined
# alongside its three siblings; it is added here.
#
# NOTE(review): a character with both an upper and a lower mapping (presumably the
# title-case letters) satisfies both predicates - confirm this approximation is acceptable.
# NOTE(review): assumes `_is_upper_slp` is not already defined in another file.

# Handle range 0x0000-0xffff
@inline function _is_lower_bmp(ch)
    t = ch >>> 9
    return ((ct.can_u >>> t) & 1) != 0 && _check_tab(ct.can_u_tab[t+1], ch)
end

# Handle range 0x10000-0x1ffff
@inline function _is_lower_slp(ch)
    t = ch >>> 9
    return ((ct.s_can_u >>> (t & 0x7f)) & 1) != 0 && _check_tab(ct.can_u_tab[t+1], ch)
end

# Handle range 0x0000-0xffff
@inline function _is_upper_bmp(ch)
    t = ch >>> 9
    return ((ct.can_l >>> t) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch)
end

# Handle range 0x10000-0x1ffff
@inline function _is_upper_slp(ch)
    t = ch >>> 9
    return ((ct.s_can_l >>> (t & 0x7f)) & 1) != 0 && _check_tab(ct.can_l_tab[t+1], ch)
end

# "Is lower-case" predicate, dispatched by range:
# 0x00-0x7f -> _islower_a, 0x80-0xff -> _islower_l, 0x100-0xffff -> _is_lower_bmp,
# 0x10000-0x1ffff -> _is_lower_slp, anything larger -> false.
@inline function _is_lower_ch(ch)
    if ch <= 0x7f
        return _islower_a(ch)
    elseif ch <= 0xff
        return _islower_l(ch)
    elseif ch <= 0xffff
        return _is_lower_bmp(ch)
    elseif ch <= 0x1ffff
        return _is_lower_slp(ch)
    else
        return false
    end
end

# "Is upper-case" predicate, dispatched by range:
# 0x00-0x7f -> _isupper_a, 0x80-0xff -> _isupper_l, 0x100-0xffff -> _is_upper_bmp,
# 0x10000-0x1ffff -> _is_upper_slp, anything larger -> false.
# NOTE(review): _is_upper_slp is not defined in the visible part of this file - confirm
# it exists elsewhere.
@inline function _is_upper_ch(ch)
    if ch <= 0x7f
        return _isupper_a(ch)
    elseif ch <= 0xff
        return _isupper_l(ch)
    elseif ch <= 0xffff
        return _is_upper_bmp(ch)
    elseif ch <= 0x1ffff
        return _is_upper_slp(ch)
    else
        return false
    end
end

# "Can be lower-cased" predicate, dispatched by range:
# 0x00-0x7f -> _isupper_a, 0x80-0xff -> _isupper_l, 0x100-0xffff -> _can_lower_bmp,
# 0x10000-0x1ffff -> _can_lower_slp, anything larger -> false.
@inline function _can_lower_ch(ch)
    if ch <= 0x7f
        return _isupper_a(ch)
    elseif ch <= 0xff
        return _isupper_l(ch)
    elseif ch <= 0xffff
        return _can_lower_bmp(ch)
    elseif ch <= 0x1ffff
        return _can_lower_slp(ch)
    else
        return false
    end
end

# "Can be upper-cased" predicate, dispatched by range:
# 0x00-0x7f -> _islower_a, 0x80-0xff -> _can_upper_lat, 0x100-0xffff -> _can_upper_bmp,
# 0x10000-0x1ffff -> _can_upper_slp, anything larger -> false.
@inline function _can_upper_ch(ch)
    if ch <= 0x7f
        return _islower_a(ch)
    elseif ch <= 0xff
        return _can_upper_lat(ch)
    elseif ch <= 0xffff
        return _can_upper_bmp(ch)
    elseif ch <= 0x1ffff
        return _can_upper_slp(ch)
    else
        return false
    end
end

# Title-case eligibility is an alias of the upper-case predicate: any code unit accepted
# by _can_upper_ch is treated as one that _title_ch may change.
const _can_title_ch = _can_upper_ch

# Lower-case mapping of a single code unit / code point, dispatched by range:
#   0x00-0x7f        add 0x20 when _isupper_a(ch), otherwise unchanged
#   0x80-0xff        add 0x20 when _isupper_l(ch), otherwise unchanged
#   0x100-0xffff     _lower_bmp(ch)
#   0x10000-0x1ffff  _lower_slp(ch)
#   anything larger  returned unchanged
#
# Fix: the 0x80-0xff branch was written `(_isupper_l(ch) : ch + 0x20 : ch)`. With `:`
# in place of `?` Julia parses that as the range `a:b:c`, so the branch produced a range
# object instead of a code unit. It is now the intended conditional, matching the
# 0x00-0x7f branch.
@inline _lower_ch(ch) =
    ch <= 0x7f ? (_isupper_a(ch) ? ch + 0x20 : ch) :
    ch <= 0xff ? (_isupper_l(ch) ? ch + 0x20 : ch) :
    ch <= 0xffff ? _lower_bmp(ch) :
    ch <= 0x1ffff ? _lower_slp(ch) : ch

# Upper-case mapping of a single code unit / code point, dispatched by range:
# 0x00-0x7f subtracts 0x20 when _islower_a(ch); 0x80-0xff uses _upper_lat;
# 0x100-0xffff uses _upper_bmp; 0x10000-0x1ffff uses _upper_slp; larger values are
# returned unchanged.
@inline function _upper_ch(ch)
    if ch <= 0x7f
        return _islower_a(ch) ? ch - 0x20 : ch
    elseif ch <= 0xff
        return _upper_lat(ch)
    elseif ch <= 0xffff
        return _upper_bmp(ch)
    elseif ch <= 0x1ffff
        return _upper_slp(ch)
    else
        return ch
    end
end

# Title-case mapping of a single code unit / code point, dispatched by range.
# Identical to _upper_ch except for 0x100-0xffff, where _title_bmp is used; the
# 0x10000-0x1ffff range falls back to the upper-case mapping (_upper_slp).
@inline function _title_ch(ch)
    if ch <= 0x7f
        return _islower_a(ch) ? ch - 0x20 : ch
    elseif ch <= 0xff
        return _upper_lat(ch)
    elseif ch <= 0xffff
        return _title_bmp(ch)
    elseif ch <= 0x1ffff
        return _upper_slp(ch)
    else
        return ch
    end
end
Loading

0 comments on commit 3163363

Please sign in to comment.