Skip to content

Commit

Permalink
Change to use GitHub Actions
Browse files Browse the repository at this point in the history
  • Loading branch information
ScottPJones committed Jan 17, 2021
1 parent 98fedf9 commit e797807
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 116 deletions.
34 changes: 0 additions & 34 deletions .travis.yml

This file was deleted.

22 changes: 1 addition & 21 deletions src/ascii.jl
Original file line number Diff line number Diff line change
@@ -1,32 +1,12 @@
#=
ASCIIStr type
Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones,
Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones,
and other contributors to the Julia language
Licensed under MIT License, see LICENSE.md
Based in part on code for ASCIIString that used to be in Julia
=#

## overload methods for efficiency ##

# Join the code units of every string in `coll` into one newly allocated
# `UInt8` buffer and return that buffer.
function _string(coll)
    # Pass 1: size of the result, in bytes (code units are single bytes here)
    total = 0
    for s in coll
        total += ncodeunits(s)
    end
    buf, dst = _allocate(UInt8, total)
    # Pass 2: append each string's bytes at the running write position
    for s in coll
        nbytes = ncodeunits(s)
        # keep `s` rooted while its raw pointer is in use
        @preserve s unsafe_copyto!(dst, pointer(s), nbytes)
        dst += nbytes
    end
    return buf
end

# Concatenate ASCII strings (or substrings of them).
# A single argument is returned as-is; anything else is copied into a new ASCII `Str`.
function string(c::MaybeSub{<:Str{ASCIICSE}}...)
    length(c) == 1 && return c[1]
    return Str(ASCIICSE, _string(c))
end

## transcoding to ASCII ##

function convert(::Type{<:Str{ASCIICSE}}, str::AbstractString)
Expand Down
20 changes: 2 additions & 18 deletions src/latin.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#=
LatinStr/_LatinStr type (ISO Latin1 8-bit subset of Unicode)
Copyright 2017 Gandalf Software, Inc., Scott P. Jones, and other contributors to the Julia language
Copyright 2017, 2020 Gandalf Software, Inc., Scott P. Jones,
and other contributors to the Julia language
Licensed under MIT License, see LICENSE.md
Based in part on code for ASCIIString that used to be in Julia
=#
Expand All @@ -13,23 +14,6 @@ is_latin(str::MaybeSub{<:Str{<:LatinCSE}}) = true
# For `MS_Latin` arguments these predicates are constant: the contents of the string
# are never inspected, the answer follows from the argument type alone.
is_bmp(str::MS_Latin) = true
is_unicode(str::MS_Latin) = true

const MS_ASCIILatin = MaybeSub{<:Str{<:Union{ASCIICSE, Latin_CSEs}}}

# Concatenate any mix of ASCII / Latin strings (or substrings of them) into a `LatinCSE` string.
# A single argument is handed back unchanged, without copying.
function string(collection::MS_ASCIILatin...)
    if length(collection) == 1
        return collection[1]
    end
    # Pass 1: total number of code units in the result
    total = 0
    @inbounds for s in collection
        total += ncodeunits(s)
    end
    buf, dst = _allocate(total)
    # Pass 2: copy each piece to the running write position
    @inbounds for s in collection
        nbytes = ncodeunits(s)
        _memcpy(dst, pointer(s), nbytes)
        dst += nbytes
    end
    return Str(LatinCSE, buf)
end

## transcoding to Latin1 ##

function convert(::Type{<:Str{C}}, str::AbstractString) where {C<:Latin_CSEs}
Expand Down
58 changes: 25 additions & 33 deletions src/utf16.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,41 +11,47 @@ const _trail_mask = CHUNKSZ == 4 ? 0xdc00_dc00 : 0xdc00_dc00_dc00_dc00
const _hi_bit_16 = CHUNKSZ == 4 ? 0x8000_8000 : 0x8000_8000_8000_8000

# Wide counterparts of `_trail_mask` / `_hi_bit_16`, produced by `_widen_mask`
# (presumably the same bit pattern repeated across a `BigChunk` -- confirm against `_widen_mask`).
# `_big_hi_bit_16` must be derived from `_hi_bit_16`; initialising it from itself fails at
# load time because the name is not yet defined.
const _big_trail_mask = _widen_mask(_trail_mask)
const _big_hi_bit_16 = _widen_mask(_hi_bit_16)

# OR `v` with itself shifted left by 1 through 5 bits, keep only the bits selected by `msk`,
# then invert those bits.  A bit of `msk` is therefore set in the result exactly when that
# bit and the five bits directly below it are all zero in `v`.
@inline _mask_surr(v, msk) = xor((v | v<<1 | v<<2 | v<<3 | v<<4 | v<<5) & msk, msk)

# XOR-ing with the trail mask zeroes the top six bits of every 16-bit code unit whose top six
# bits match the mask, so the result has the high bit of a 16-bit lane set exactly for those
# code units (0xDC00-0xDFFF with the 0xdc00 pattern used for `_trail_mask`).
# `_mask_surr` always needs the matching high-bit mask as its second argument.
@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask), _hi_bit_16)
@inline _get_masked(v::BigChunk) = _mask_surr(xor(v, _big_trail_mask), _big_hi_bit_16)
@inline _get_masked(qpnt::Ptr) = _get_masked(unsafe_load(qpnt))

# Complement of `_get_masked` within the high-bit positions: the high bit of a lane is set
# for every code unit that `_get_masked` did NOT flag, so `count_ones` of the result counts
# those code units in the loaded chunk.
@inline _get_lead(qpnt::Ptr{UInt}) = xor(_get_masked(qpnt), _hi_bit_16)
@inline _get_lead(qpnt::Ptr{BigChunk}) = xor(_get_masked(qpnt), _big_hi_bit_16)

@inline function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
# First check very frequent cases of short strings
# (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
# taking advantage of the knowledge of how String types are stored in Julia,
# i.e. UInt length, immediate followed by the string data, aligned on sizeof(UInt)*2
cnt <<= 1
if cnt <= BIGCHUNKSZ
return (cnt <= CHUNKSZ
? count_ones(_mask_bytes(_get_lead(_pntchunk(beg), cnt))
: count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg), cnt))
end
## overload methods for efficiency ##

function _length_utf16_al(beg::Ptr{UInt16}, cnt::Int)
len = count_ones(_get_lead(_pntchunk(beg)))
cnt -= CHUNKSZ
pnt = _pntbigchunk(beg + CHUNKSZ)
v = _get_lead(pnt)
cnt <= BIGCHUNKSZ && return len + count_ones(_mask_bytes(v, cnt))
fin = pnt + cnt
while (pnt += BIGCHUNKSZ) < fin
len += count_ones(v)
v = _get_lead(pnt)
if cnt > BIGCHUNKSZ
fin = pnt + cnt
while (pnt += BIGCHUNKSZ) < fin
len += count_ones(v)
v = _get_lead(pnt)
end
end
len + count_ones(_mask_bytes(v, cnt))
end

# Character count of an aligned UTF-16 buffer starting at `beg`.
# `cnt` is doubled on entry (presumably converting a count of 16-bit code units into a
# byte count -- confirm against callers); buffers longer than one big chunk are delegated
# to `_length_utf16_al`.
function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
    # First check very frequent cases of short strings
    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
    # taking advantage of the knowledge of how String types are stored in Julia,
    # i.e. UInt length, immediate followed by the string data, aligned on sizeof(UInt)*2
    cnt <<= 1
    # `_get_lead` accepts only the chunk pointer; the byte count is the second argument of
    # `_mask_bytes`, which trims the loaded chunk before the bits are counted.
    cnt <= CHUNKSZ &&
        return count_ones(_mask_bytes(_get_lead(_pntchunk(beg)), cnt))
    cnt <= BIGCHUNKSZ &&
        return count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg)), cnt))
    return _length_utf16_al(beg, cnt)
end

function _length_ul(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
align = reinterpret(UInt, beg)
pnt = reinterpret(Ptr{BigChunk}, align & ~BIGCHUNKMSK)
Expand Down Expand Up @@ -104,20 +110,6 @@ function _prevind(::MultiCU, str::MS_UTF16, pos::Int, nchar::Int)
end

# Check for any surrogate characters
# Return `true` when `_get_masked` reports no flagged code unit anywhere in `str`
# (i.e. no surrogate was detected), scanning one `CHUNKSZ`-byte chunk at a time.
function is_bmp(str::MS_UTF16)
# An empty string trivially contains nothing to flag
(siz = sizeof(str)) == 0 && return true
# TODO: handle unaligned data for ARM32
# Keep `str` rooted while raw pointers into its storage are in use
@preserve str begin
# Shorter than one chunk: a single load, with the result restricted by `_mask_bytes(siz)`
# (presumably a mask selecting only the first `siz` bytes -- confirm)
siz < CHUNKSZ && return (_get_masked(_pntchunk(str)) & _mask_bytes(siz)) == 0

# NOTE(review): `_calcpnt` is defined elsewhere; the loop below relies on it returning a
# start pointer one chunk *before* the data and the position of the last full chunk -- confirm
pnt, fin = _calcpnt(str, siz)
# `pnt` is advanced inside the loop condition, before each chunk is examined
while (pnt += CHUNKSZ) <= fin
# Any flagged code unit ends the scan immediately
_get_masked(pnt) == 0 || return false
end
# Either the full chunks covered everything, or the remaining partial chunk is checked
# with the bytes outside the string masked off
pnt - CHUNKSZ == fin || (_get_masked(pnt) & _mask_bytes(siz)) == 0
end
end

@inline function _check_bmp_utf16_al(beg, cnt)
cnt <= CHUNKSZ && return _mask_bytes(_get_masked(_pntchunk(beg)), cnt) == 0
cnt <= BIGCHUNKSZ && return _mask_bytes(_get_masked(_pntbigchunk(beg)), cnt) == 0
Expand Down
10 changes: 0 additions & 10 deletions src/utf8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ _all_latin(val) =

@inline function _check_latin_utf8_al(beg, cnt)
cnt <= CHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntchunk(ptr)), cnt))
bigmsk = _widen_mask(msk)
cnt <= BIGCHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntbigchunk(ptr)), cnt))
_all_latin(unsafe_load(_pntchunk(ptr))) || return false
cnt -= CHUNKSZ
Expand Down Expand Up @@ -601,15 +600,6 @@ _prevind(::MultiCU, str::Str{RawUTF8CSE}, pos::Int, nchar::Int) =
# Raw UTF-8 strings delegate to `prevind` on the wrapped `data` field.
function _prevind(::MultiCU, str::Str{RawUTF8CSE}, pos::Int)
    return prevind(str.data, pos)
end

#=
const _ByteStr = Union{Str{ASCIICSE}, SubString{<:Str{ASCIICSE}},
Str{UTF8CSE}, SubString{<:Str{UTF8CSE}}}
string(s::_ByteStr) = s
string(s::_ByteStr, c::_ByteStr...) = UTF8Str(_string(c))
# ^^ at least one must be UTF-8 or the ASCII-only method would get called
=#

function _reverse(::MultiCU, ::Type{UTF8CSE}, len, pnt::Ptr{T}) where {T<:CodeUnitTypes}
buf, beg = _allocate(T, len)
out = beg + len
Expand Down
138 changes: 138 additions & 0 deletions src/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,144 @@ Licensed under MIT License, see LICENSE.md
Based initially on julia/test/strings/util.jl
=#

# Concatenate the code units of `a` and `b` into a newly allocated buffer and return it.
#
# `T` is the code unit type handed to `_allocate` (e.g. `UInt8`, `UInt16`, `UInt32`);
# `a` and `b` must support `ncodeunits` and `pointer`.
function _concat(T, a, b)
    la = ncodeunits(a)
    lb = ncodeunits(b)
    buf, out = _allocate(T, la + lb)
    @preserve a b begin
        unsafe_copyto!(out, pointer(a), la)
        # `unsafe_copyto!` counts elements, but `Ptr + n` moves by n *bytes*:
        # scale the offset by the code unit size so that `b` lands directly after `a`
        # for 2- and 4-byte code units as well.
        unsafe_copyto!(out + la * sizeof(T), pointer(b), lb)
    end
    return buf
end

# Concatenate the code units of `a`, `b` and every string in `rest` into a newly
# allocated buffer and return it.
#
# `T` is the code unit type handed to `_allocate`; all pieces must support
# `ncodeunits` and `pointer`.
function _string(T, a, b, rest)
    la = ncodeunits(a)
    lb = ncodeunits(b)
    # Pass 1: total length of the result, in code units
    total = la + lb
    for str in rest
        total += ncodeunits(str)
    end
    buf, out = _allocate(T, total)
    # `unsafe_copyto!` counts elements, but pointer arithmetic is in bytes, so every
    # advance of the write position is scaled by the code unit size.
    unit = sizeof(T)
    @preserve a unsafe_copyto!(out, pointer(a), la)
    out += la * unit
    @preserve b unsafe_copyto!(out, pointer(b), lb)
    out += lb * unit
    # Pass 2: append the remaining pieces
    for str in rest
        len = ncodeunits(str)
        @preserve str unsafe_copyto!(out, pointer(str), len)
        out += len * unit
    end
    return buf
end

# Concatenate the code units of every string in `coll` into a newly allocated buffer
# and return it.
#
# `T` is the code unit type handed to `_allocate`; each element of `coll` must support
# `ncodeunits` and `pointer`.
function _string(T, coll)
    # Pass 1: total length of the result, in code units
    total = 0
    for str in coll
        total += ncodeunits(str)
    end
    buf, out = _allocate(T, total)
    # Pass 2: copy each piece to the running write position
    for str in coll
        len = ncodeunits(str)
        @preserve str unsafe_copyto!(out, pointer(str), len)
        # `Ptr + n` moves by n bytes while `len` counts code units: scale by sizeof(T)
        out += len * sizeof(T)
    end
    return buf
end

# Handle concatenation where all the same CSE for strings, and character set for characters
#=
"""
WIP: this is rather tricky.
It really should handle any type of Chr / Str / CSE, not just the ones defined
in CharSetEncodings, ChrBase and StrBase
Ideally, it could also handle mixes with String and Char (or other AbstractString / AbstractChar
types).
It may need to do two or even three passes, one to determine the correct type to be output,
another to determine the output length, and finally another to copy the strings / characters into
the buffer.
The result type should be based on promotion rules, i.e. outputting UCS2Str if only ASCII, Latin, UCS2 characters and strings are in the list.
This is difficult to do in a way that will still be type stable.
"""
function _string_chr(a::Union{<:Chr{CS,T}, <:Str{C}, SubString{<:Str{C}}}...
) where {CS<:CharSet,T,C<:CSE{CS}}
len = 0
for v in a
if v isa Chr
len += 1
else
len += ncodeunits(v)
end
end
buf, out = _allocate(T, len)
for v in a
len = ncodeunits(str)
@preserve str unsafe_copyto!(out, pointer(str), len)
out += len
end
buf
end
=#

# `string` applied to a single `Str` (or `SubString` of one) hands the argument back unchanged.
function string(c::MaybeSub{<:Str})
    return c
end

# ASCII and/or Latin pieces: 8-bit code units, wrapped as `LatinCSE`
function string(c::MaybeSub{<:Str{<:Union{ASCIICSE,Latin_CSEs}}}...)
    return Str(LatinCSE, _string(UInt8, c))
end

# ASCII and/or UTF-8 pieces: 8-bit code units, wrapped as `UTF8CSE`
function string(c::MaybeSub{<:Str{<:Union{ASCIICSE,UTF8CSE}}}...)
    return Str(UTF8CSE, _string(UInt8, c))
end

# UCS-2 family pieces: 16-bit code units, wrapped as `UCS2CSE`
function string(c::MaybeSub{<:Str{<:UCS2_CSEs}}...)
    return Str(UCS2CSE, _string(UInt16, c))
end

# UCS-2 family and/or UTF-16 pieces: 16-bit code units, wrapped as `UTF16CSE`
function string(c::MaybeSub{<:Str{<:Union{UCS2_CSEs,UTF16CSE}}}...)
    return Str(UTF16CSE, _string(UInt16, c))
end

# UTF-32 family pieces: 32-bit code units, wrapped as `UTF32CSE`
function string(c::MaybeSub{<:Str{<:UTF32_CSEs}}...)
    return Str(UTF32CSE, _string(UInt32, c))
end

#=
const MS_Str{C} = MaybeSub{<:Str{C}}
string(a::MS_Str{C}, b::MS_Str{C}) where {C<:CSE} = Str(C, _concat(codeunit(C), a, b))
string(a::MS_Str{C}, b::MS_Str{C}, c::MS_Str{C}...) where {C<:CSE} =
Str(C, _string(codeunit(C), a, b, c))
string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
string(a::T, b::T) where {T<:MS_Str{ASCIICSE}} = string(ASCIICSE, _concat(UInt8, a, b))
const MS_AL = MS_Str{<:Union{ASCIICSE,Latin_CSEs}}
string(a::MS_AL, b::MS_AL) = Str(LatinCSE, _concat(UInt8, a, b))
string(a::MS_AL, b::MS_AL, c::MS_AL...) = Str(LatinCSE, _string(UInt8, a, b, c))
const MS_AU = MS_Str{<:Union{ASCIICSE,UTF8CSE}}
string(a::MS_AU, b::MS_AU) = Str(UTF8CSE, _concat(UInt8, a, b))
string(a::MS_AU, b::MS_AU, c::MS_AU...) = Str(UTF8CSE, _string(UInt8, a, b, c))
const MS_U2 = MS_Str{<:UCS2_CSEs}
string(a::MS_U2, b::MS_U2) = Str(UCS2CSE, _concat(UInt16, a, b))
string(a::MS_U2, b::MS_U2, c::MS_U2...) = Str(UCS2CSE, _string(UInt16, a, b, c))
const MS_UT = MS_Str{<:Union{UCS2_CSEs,UTF16CSE}}
string(a::MS_UT, b::MS_UT) = Str(UTF16CSE, _concat(UInt16, a, b))
string(a::MS_UT, b::MS_UT, c::MS_UT...) = Str(UTF16CSE, _string(UInt16, a, b, c))
const MS_U4 = MS_Str{<:UTF32_CSEs}
string(a::MS_U4, b::MS_U4) = Str(UTF32CSE, _concat(UInt32, a, b))
string(a::MS_U4, b::MS_U4, c::MS_U4...) = Str(UTF32CSE, _string(UInt32, a, b, c))
=#

#=
string(c::MaybeSub{<:Str{<:Union{ASCIICSE,Latin_CSEs}}}...) =
length(c) == 1 ? c[1] : Str(LatinCSE, _string(UInt8, c))
string(c::MaybeSub{<:Str{<:Union{ASCIICSE,UTF8CSE}}}...) =
length(c) == 1 ? c[1] : Str(UTF8CSE, _string(UInt8, c))
string(c::MaybeSub{<:Str{<:UCS2_CSEs}}...) =
length(c) == 1 ? c[1] : Str(UCS2CSE, _string(UInt16, c))
string(c::MaybeSub{<:Str{<:Union{UCS2_CSEs,UTF16CSE}}}...) =
length(c) == 1 ? c[1] : Str(UTF16CSE, _string(UInt16, c))
string(c::MaybeSub{<:Str{<:UTF32_CSEs}}...) =
length(c) == 1 ? c[1] : Str(UTF32CSE, _string(UInt32, c))
=#
# NOTE: the `string` methods for the ASCII/Latin, ASCII/UTF-8, UCS-2, UCS-2/UTF-16 and
# UTF-32 families are defined once, earlier in this file (right after the disabled
# `_string_chr` draft).  The five definitions that used to follow here repeated those
# signatures and bodies verbatim, so loading them merely overwrote identical methods.

# starts with and ends with predicates

starts_with(a::MaybeSub{<:Str{C}}, b::MaybeSub{<:Str{C}}) where {C<:CSE} =
Expand Down
25 changes: 25 additions & 0 deletions test/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,31 @@
#non-hex characters
@test_throws ArgumentError hex2bytes(b"0123456789abcdefABCDEFGH")
end

@testset "Concatenation" begin
    # Fixtures: one string per string type exercised below
    asc = ASCIIStr("foo")
    lat = LatinStr("bar")
    ucs = UCS2Str("baz")
    u32 = UTF32Str("silly")
    ut8 = UTF8Str("test")
    ut16 = UTF16Str("ugly")
    haslat = _LatinStr("você")
    # NOTE(review): this literal is empty as it stands; presumably it was meant to hold a
    # character outside Latin-1 -- confirm against the repository
    hasucs = _UCS2Str("")
    hasu32 = _UTF32Str("\U1f596")

    # (left operand, right operand, expected type of `left * right`)
    cases = (
        (asc, asc, ASCIIStr),
        (asc, lat, LatinStr),
        (asc, ut8, UTF8Str),
        (asc, haslat, LatinStr),
        (lat, lat, LatinStr),
        (haslat, haslat, _LatinStr),
        (lat, haslat, LatinStr),
        (ucs, ucs, UCS2Str),
        (hasucs, hasucs, _UCS2Str),
        (ucs, hasucs, UCS2Str),
        (u32, u32, UTF32Str),
        (hasu32, hasu32, _UTF32Str),
        (u32, hasu32, UTF32Str),
    )
    for (lhs, rhs, expected) in cases
        @test typeof(lhs * rhs) == expected
    end
end
end

# b"" should be immutable
Expand Down

0 comments on commit e797807

Please sign in to comment.