diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 423d9fa..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-## Documentation: http://docs.travis-ci.com/user/languages/julia/
-language: julia
-os:
-  - linux
-  - osx
-  - windows
-julia:
-  - 1.0
-  - 1
-  - nightly
-notifications:
-  email: false
-git:
-  depth: 99999999
-
-## uncomment the following lines to allow failures on nightly julia
-## (tests will run but not make your overall status red)
-matrix:
-  allow_failures:
-  - julia: nightly
-
-## uncomment and modify the following lines to manually install system packages
-#addons:
-#  apt: # apt-get for linux
-#    packages:
-#    - gfortran
-#before_script: # homebrew for mac
-#  - if [ $TRAVIS_OS_NAME = osx ]; then brew install gcc; fi
-
-## uncomment the following lines to override the default test script
-
-after_success:
-  # push coverage results to Codecov
-  - julia -e 'using Pkg; cd(Pkg.dir("StrBase")); Pkg.add("Coverage"); using Coverage; Codecov.submit(Codecov.process_folder())'
diff --git a/src/ascii.jl b/src/ascii.jl
index 20c7ef9..e8c9adc 100644
--- a/src/ascii.jl
+++ b/src/ascii.jl
@@ -1,32 +1,12 @@
 #=
 ASCIIStr type
 
-Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones,
+Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones,
 and other contributors to the Julia language
 Licensed under MIT License, see LICENSE.md
 Based in part on code for ASCIIString that used to be in Julia
 =#
 
-## overload methods for efficiency ##
-
-function _string(coll)
-    n = 0
-    for str in coll
-        n += ncodeunits(str)
-    end
-    buf, out = _allocate(UInt8, n)
-    for str in coll
-        @preserve str begin
-            len = ncodeunits(str)
-            unsafe_copyto!(out, pointer(str), len)
-            out += len
-        end
-    end
-    buf
-end
-
-string(c::MaybeSub{<:Str{ASCIICSE}}...) = length(c) == 1 ? c[1] : Str(ASCIICSE, _string(c))
-
 ## transcoding to ASCII ##
 
 function convert(::Type{<:Str{ASCIICSE}}, str::AbstractString)
diff --git a/src/latin.jl b/src/latin.jl
index 14c59bd..9f63fcb 100644
--- a/src/latin.jl
+++ b/src/latin.jl
@@ -1,7 +1,8 @@
 #=
 LatinStr/_LatinStr type (ISO Latin1 8-bit subset of Unicode)
 
-Copyright 2017 Gandalf Software, Inc., Scott P. Jones, and other contributors to the Julia language
+Copyright 2017, 2020 Gandalf Software, Inc., Scott P. Jones,
+and other contributors to the Julia language
 Licensed under MIT License, see LICENSE.md
 Based in part on code for ASCIIString that used to be in Julia
 =#
@@ -13,23 +14,6 @@ is_latin(str::MaybeSub{<:Str{<:LatinCSE}}) = true
 is_bmp(str::MS_Latin) = true
 is_unicode(str::MS_Latin) = true
 
-const MS_ASCIILatin = MaybeSub{<:Str{<:Union{ASCIICSE, Latin_CSEs}}}
-
-function string(collection::MS_ASCIILatin...)
-    length(collection) == 1 && return collection[1]
-    len = 0
-    @inbounds for str in collection
-        len += ncodeunits(str)
-    end
-    buf, pnt = _allocate(len)
-    @inbounds for str in collection
-        len = ncodeunits(str)
-        _memcpy(pnt, pointer(str), len)
-        pnt += len
-    end
-    Str(LatinCSE, buf)
-end
-
 ## transcoding to Latin1 ##
 
 function convert(::Type{<:Str{C}}, str::AbstractString) where {C<:Latin_CSEs}
diff --git a/src/utf16.jl b/src/utf16.jl
index e3a46e6..cbd3d7b 100644
--- a/src/utf16.jl
+++ b/src/utf16.jl
@@ -11,41 +11,49 @@ const _trail_mask = CHUNKSZ == 4 ? 0xdc00_dc00 : 0xdc00_dc00_dc00_dc00
 const _hi_bit_16 = CHUNKSZ == 4 ? 0x8000_8000 : 0x8000_8000_8000_8000
 
 const _big_trail_mask = _widen_mask(_trail_mask)
-const _big_hi_bit_16 = _widen_mask(_big_hi_bit_16)
+const _big_hi_bit_16 = _widen_mask(_hi_bit_16)
 
 @inline _mask_surr(v, msk) = xor((v | v<<1 | v<<2 | v<<3 | v<<4 | v<<5) & msk, msk)
 
-@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask))
-@inline _get_masked(v::BigChunk) = _mask_surr(xor(v, _big_trail_mask))
+@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask), _hi_bit_16)
+@inline _get_masked(v::BigChunk) = _mask_surr(xor(v, _big_trail_mask), _big_hi_bit_16)
 @inline _get_masked(qpnt::Ptr) = _get_masked(unsafe_load(qpnt))
 
 @inline _get_lead(qpnt::Ptr{UInt}) = xor(_get_masked(qpnt), _hi_bit_16)
 @inline _get_lead(qpnt::Ptr{BigChunk}) = xor(_get_masked(qpnt), _big_hi_bit_16)
 
-@inline function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
-    # First check very frequent cases of short strings
-    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
-    # taking advantage of the knowledge of how String types are stored in Julia,
-    # i.e. UInt length, immediate followed by the string data, aligned on sizeof(UInt)*2
-    cnt <<= 1
-    if cnt <= BIGCHUNKSZ
-        return (cnt <= CHUNKSZ
-                ? count_ones(_mask_bytes(_get_lead(_pntchunk(beg), cnt))
-                : count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg), cnt))
-    end
+## overload methods for efficiency ##
+
+# Slow path of _length_al: sums count_ones over the bits left set by _get_lead
+# (one per code unit that is not a trail surrogate); cnt is a byte count > BIGCHUNKSZ
+function _length_utf16_al(beg::Ptr{UInt16}, cnt::Int)
     len = count_ones(_get_lead(_pntchunk(beg)))
     cnt -= CHUNKSZ
     pnt = _pntbigchunk(beg + CHUNKSZ)
     v = _get_lead(pnt)
-    cnt <= BIGCHUNKSZ && return len + count_ones(_mask_bytes(v, cnt))
-    fin = pnt + cnt
-    while (pnt += BIGCHUNKSZ) < fin
-        len += count_ones(v)
-        v = _get_lead(pnt)
+    if cnt > BIGCHUNKSZ
+        fin = pnt + cnt
+        while (pnt += BIGCHUNKSZ) < fin
+            len += count_ones(v)
+            v = _get_lead(pnt)
+        end
     end
     len + count_ones(_mask_bytes(v, cnt))
 end
 
+function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
+    # First check very frequent cases of short strings
+    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
+    # taking advantage of the knowledge of how String types are stored in Julia,
+    # i.e. UInt length, immediately followed by the string data, aligned on sizeof(UInt)*2
+    cnt <<= 1
+    (cnt <= BIGCHUNKSZ
+     ? (cnt <= CHUNKSZ
+        ? count_ones(_mask_bytes(_get_lead(_pntchunk(beg)), cnt))
+        : count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg)), cnt)))
+     : _length_utf16_al(beg, cnt))
+end
+
 function _length_ul(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
     align = reinterpret(UInt, beg)
     pnt = reinterpret(Ptr{BigChunk}, align & ~BIGCHUNKMSK)
@@ -104,20 +112,6 @@ function _prevind(::MultiCU, str::MS_UTF16, pos::Int, nchar::Int)
 end
 
 # Check for any surrogate characters
-function is_bmp(str::MS_UTF16)
-    (siz = sizeof(str)) == 0 && return true
-    # Todo: handle unaligned for ARM32
-    @preserve str begin
-        siz < CHUNKSZ && return (_get_masked(_pntchunk(str)) & _mask_bytes(siz)) == 0
-
-        pnt, fin = _calcpnt(str, siz)
-        while (pnt += CHUNKSZ) <= fin
-            _get_masked(pnt) == 0 || return false
-        end
-        pnt - CHUNKSZ == fin || (_get_masked(pnt) & _mask_bytes(siz)) == 0
-    end
-end
-
 @inline function _check_bmp_utf16_al(beg, cnt)
     cnt <= CHUNKSZ && return _mask_bytes(_get_masked(_pntchunk(beg)), cnt) == 0
     cnt <= BIGCHUNKSZ && return _mask_bytes(_get_masked(_pntbigchunk(beg)), cnt) == 0
diff --git a/src/utf8.jl b/src/utf8.jl
index bd41a4f..71b07b1 100644
--- a/src/utf8.jl
+++ b/src/utf8.jl
@@ -226,7 +226,6 @@ _all_latin(val) =
 
 @inline function _check_latin_utf8_al(beg, cnt)
     cnt <= CHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntchunk(ptr)), cnt))
-    bigmsk = _widen_mask(msk)
     cnt <= BIGCHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntbigchunk(ptr)), cnt))
     _all_latin(unsafe_load(_pntchunk(ptr))) || return false
     cnt -= CHUNKSZ
@@ -601,15 +600,6 @@ _prevind(::MultiCU, str::Str{RawUTF8CSE}, pos::Int, nchar::Int) =
 _prevind(::MultiCU, str::Str{RawUTF8CSE}, pos::Int) =
     prevind(str.data, pos)
 
-#=
-const _ByteStr = Union{Str{ASCIICSE}, SubString{<:Str{ASCIICSE}},
-                       Str{UTF8CSE}, SubString{<:Str{UTF8CSE}}}
-
-string(s::_ByteStr) = s
-string(s::_ByteStr, c::_ByteStr...) = UTF8Str(_string(c))
-    # ^^ at least one must be UTF-8 or the ASCII-only method would get called
-=#
-
 function _reverse(::MultiCU, ::Type{UTF8CSE}, len, pnt::Ptr{T}) where {T<:CodeUnitTypes}
     buf, beg = _allocate(T, len)
     out = beg + len
diff --git a/src/util.jl b/src/util.jl
index 045b650..a4c39e4 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -7,6 +7,147 @@ Licensed under MIT License, see LICENSE.md
 Based initially on julia/test/strings/util.jl
 =#
 
+# Copy the code units of `a` and then `b` into a buffer obtained from
+# `_allocate(T, ...)`; returns that buffer (callers wrap it with `Str(CSE, buf)`)
+function _concat(T, a, b)
+    la = ncodeunits(a)
+    lb = ncodeunits(b)
+    buf, out = _allocate(T, la + lb)
+    @preserve a unsafe_copyto!(out, pointer(a), la)
+    @preserve b unsafe_copyto!(out + la, pointer(b), lb)
+    buf
+end
+
+# Like `_concat`, then appends the code units of every string in `rest`
+function _string(T, a, b, rest)
+    la = ncodeunits(a)
+    lb = ncodeunits(b)
+    len = la + lb
+    @inbounds for str in rest
+        len += ncodeunits(str)
+    end
+    buf, out = _allocate(T, len)
+    @preserve a unsafe_copyto!(out, pointer(a), la)
+    out += la
+    @preserve b unsafe_copyto!(out, pointer(b), lb)
+    out += lb
+    @inbounds for str in rest
+        len = ncodeunits(str)
+        @preserve str unsafe_copyto!(out, pointer(str), len)
+        out += len
+    end
+    buf
+end
+
+# Two passes over `coll`: total the code units, then copy each string in order
+function _string(T, coll)
+    len = 0
+    @inbounds for str in coll
+        len += ncodeunits(str)
+    end
+    buf, out = _allocate(T, len)
+    @inbounds for str in coll
+        len = ncodeunits(str)
+        @preserve str unsafe_copyto!(out, pointer(str), len)
+        out += len
+    end
+    buf
+end
+
+# Handle concatenation where all the same CSE for strings, and character set for characters
+#=
+"""
+WIP: this is rather tricky.
+It really should handle any type of Chr / Str / CSE, not just the ones defined
+in CharSetEncodings, ChrBase and StrBase
+Ideally, it could also handle mixes with String and Char (or other AbstractString / AbstractChar
+types).
+It may need to do two or even three passes, one to determine the correct type to be output,
+another to determine the output length, and finally another to copy the strings / characters into
+the buffer.
+The result type should be based on promotion rules, i.e. outputting UCS2Str if only ASCII, Latin, UCS2 characters and strings are in the list.
+This is difficult to do in a way that will still be type stable.
+"""
+
+function _string_chr(a::Union{<:Chr{CS,T}, <:Str{C}, SubString{<:Str{C}}}...
+                     ) where {CS<:CharSet,T,C<:CSE{CS}}
+    len = 0
+    for v in a
+        if v isa Chr
+            len += 1
+        else
+            len += ncodeunits(v)
+        end
+    end
+    buf, out = _allocate(T, len)
+    for v in a
+        len = ncodeunits(str)
+        @preserve str unsafe_copyto!(out, pointer(str), len)
+        out += len
+    end
+    buf
+end
+=#
+
+# A single string is returned unchanged
+string(c::MaybeSub{<:Str}) = c
+# All-ASCII arguments stay ASCII; this method is more specific than both the Latin
+# and the UTF-8 method below, which otherwise overlap on all-ASCII input
+string(c::MaybeSub{<:Str{ASCIICSE}}...) = Str(ASCIICSE, _string(UInt8, c))
+string(c::MaybeSub{<:Str{<:Union{ASCIICSE,Latin_CSEs}}}...) = Str(LatinCSE, _string(UInt8, c))
+string(c::MaybeSub{<:Str{<:Union{ASCIICSE,UTF8CSE}}}...) = Str(UTF8CSE, _string(UInt8, c))
+string(c::MaybeSub{<:Str{<:UCS2_CSEs}}...) = Str(UCS2CSE, _string(UInt16, c))
+string(c::MaybeSub{<:Str{<:Union{UCS2_CSEs,UTF16CSE}}}...) = Str(UTF16CSE, _string(UInt16, c))
+string(c::MaybeSub{<:Str{<:UTF32_CSEs}}...) = Str(UTF32CSE, _string(UInt32, c))
+
+#=
+const MS_Str{C} = MaybeSub{<:Str{C}}
+string(a::MS_Str{C}, b::MS_Str{C}) where {C<:CSE} = Str(C, _concat(codeunit(C), a, b))
+string(a::MS_Str{C}, b::MS_Str{C}, c::MS_Str{C}...) where {C<:CSE} =
+    Str(C, _string(codeunit(C), a, b, c))
+
+string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
+string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
+string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
+
+const MS_AL = MS_Str{<:Union{ASCIICSE,Latin_CSEs}}
+string(a::MS_AL, b::MS_AL) = Str(LatinCSE, _concat(UInt8, a, b))
+string(a::MS_AL, b::MS_AL, c::MS_AL...) = Str(LatinCSE, _string(UInt8, a, b, c))
+
+const MS_AU = MS_Str{<:Union{ASCIICSE,UTF8CSE}}
+string(a::MS_AU, b::MS_AU) = Str(UTF8CSE, _concat(UInt8, a, b))
+string(a::MS_AU, b::MS_AU, c::MS_AU...) = Str(UTF8CSE, _string(UInt8, a, b, c))
+
+const MS_U2 = MS_Str{<:UCS2_CSEs}
+string(a::MS_U2, b::MS_U2) = Str(UCS2CSE, _concat(UInt16, a, b))
+string(a::MS_U2, b::MS_U2, c::MS_U2...) = Str(UCS2CSE, _string(UInt16, a, b, c))
+
+const MS_UT = MS_Str{<:Union{UCS2_CSEs,UTF16CSE}}
+string(a::MS_UT, b::MS_UT) = Str(UTF16CSE, _concat(UInt16, a, b))
+string(a::MS_UT, b::MS_UT, c::MS_UT...) = Str(UTF16CSE, _string(UInt16, a, b, c))
+
+const MS_U4 = MS_Str{<:UTF32_CSEs}
+string(a::MS_U4, b::MS_U4) = Str(UTF32CSE, _concat(UInt32, a, b))
+string(a::MS_U4, b::MS_U4, c::MS_U4...) = Str(UTF32CSE, _string(UInt32, a, b, c))
+=#
+
+#=
+string(c::MaybeSub{<:Str{<:Union{ASCIICSE,Latin_CSEs}}}...) =
+    length(c) == 1 ? c[1] : Str(LatinCSE, _string(UInt8, c))
+
+string(c::MaybeSub{<:Str{<:Union{ASCIICSE,UTF8CSE}}}...) =
+    length(c) == 1 ? c[1] : Str(UTF8CSE, _string(UInt8, c))
+
+string(c::MaybeSub{<:Str{<:UCS2_CSEs}}...) =
+    length(c) == 1 ? c[1] : Str(UCS2CSE, _string(UInt16, c))
+
+string(c::MaybeSub{<:Str{<:Union{UCS2_CSEs,UTF16CSE}}}...) =
+    length(c) == 1 ? c[1] : Str(UTF16CSE, _string(UInt16, c))
+
+string(c::MaybeSub{<:Str{<:UTF32_CSEs}}...) =
+    length(c) == 1 ? c[1] : Str(UTF32CSE, _string(UInt32, c))
+=#
+
 # starts with and ends with predicates
 
 starts_with(a::MaybeSub{<:Str{C}}, b::MaybeSub{<:Str{C}}) where {C<:CSE} =
diff --git a/test/util.jl b/test/util.jl
index d85645b..3f77471 100644
--- a/test/util.jl
+++ b/test/util.jl
@@ -307,6 +307,31 @@
         #non-hex characters
         @test_throws ArgumentError hex2bytes(b"0123456789abcdefABCDEFGH")
     end
+
+    @testset "Concatenation" begin
+        asc = ASCIIStr("foo")
+        lat = LatinStr("bar")
+        ucs = UCS2Str("baz")
+        u32 = UTF32Str("silly")
+        ut8 = UTF8Str("test")
+        ut16 = UTF16Str("ugly")
+        haslat = _LatinStr("você")
+        hasucs = _UCS2Str("†")
+        hasu32 = _UTF32Str("\U1f596")
+        @test typeof(asc * asc) == ASCIIStr
+        @test typeof(asc * lat) == LatinStr
+        @test typeof(asc * ut8) == UTF8Str
+        @test typeof(asc * haslat) == LatinStr
+        @test typeof(lat * lat) == LatinStr
+        @test typeof(haslat * haslat) == _LatinStr
+        @test typeof(lat * haslat) == LatinStr
+        @test typeof(ucs * ucs) == UCS2Str
+        @test typeof(hasucs * hasucs) == _UCS2Str
+        @test typeof(ucs * hasucs) == UCS2Str
+        @test typeof(u32 * u32) == UTF32Str
+        @test typeof(hasu32 * hasu32) == _UTF32Str
+        @test typeof(u32 * hasu32) == UTF32Str
+    end
 end
 
 # b"" should be immutable