Add: julia-0.6.2
Former-commit-id: ccc667cf67d569f3fb3df39aa57c2134755a7551
This commit is contained in:
438
julia-0.6.2/share/julia/base/strings/string.jl
Normal file
438
julia-0.6.2/share/julia/base/strings/string.jl
Normal file
@@ -0,0 +1,438 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
# Raw byte buffers (unsigned or signed 8-bit vectors) accepted by the byte-oriented
# `search`/`rsearch` methods in this file.
const ByteArray = Union{Vector{UInt8},Vector{Int8}}
|
||||
|
||||
## constructors and conversions ##
|
||||
|
||||
# String constructor docstring from boot.jl, workaround for #16730
# and the unavailability of @doc in boot.jl context.
"""
    String(v::Vector{UInt8})

Build a new `String` out of the byte vector `v`, whose contents are expected to
be UTF-8 encoded characters. The string takes "ownership" of the array: because
strings are supposed to be immutable in Julia, `v` must not be modified for as
long as the string exists.

Use `String(copy(v))` instead when `v` still has to be modified afterwards.
"""
String(v::Array{UInt8,1}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v)
|
||||
|
"""
    unsafe_string(p::Ptr{UInt8}, [length::Integer])

Copy a string out of the C-style (NUL-terminated), UTF-8 encoded string located
at address `p`. (The pointer can be safely freed afterwards.) When `length`
(the size of the data in bytes) is given, the data need not be NUL-terminated.

The function is called "unsafe" because it will crash if `p` does not point to
valid memory holding data of the requested length.
"""
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # A NULL pointer is rejected up front instead of being handed to the runtime.
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
|
||||
# Variant without an explicit length: the data at `p` is handed to the runtime's
# `jl_cstr_to_string`. A NULL pointer is rejected with an `ArgumentError`.
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
|
||||
|
||||
# Ask the runtime (`jl_alloc_string`) for a `String` of `n` bytes.
function _string_n(n::Integer)
    return ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n)
end
|
||||
|
||||
# Conversions between `String` and its byte-vector representation.

# String -> bytes, delegated to the runtime's `jl_string_to_array`.
function convert(::Type{Vector{UInt8}}, s::String)
    return ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
end

# bytes -> String, through the `String(v)` constructor.
function convert(::Type{String}, v::Vector{UInt8})
    return String(v)
end

# A `String` converts to itself without copying.
function convert(::Type{String}, s::String)
    return s
end
|
||||
|
||||
## low-level functions ##
|
||||
|
||||
# Address of the first byte of `s`.
function pointer(s::String)
    return unsafe_convert(Ptr{UInt8}, s)
end

# Address of the `i`th byte of `s` (1-based). No bounds check is performed here.
function pointer(s::String, i::Integer)
    return pointer(s) + (i - 1)
end

# Size of `s` in bytes, read from its `len` field.
function sizeof(s::String)
    return s.len
end
|
||||
|
"""
    codeunit(s::AbstractString, i::Integer)

Return the code unit at position `i` of an encoded string. For a UTF-8 string,
for example, this is the `i`th byte of its representation.
"""
codeunit(s::AbstractString, i::Integer)

# `String` method: load byte `i` directly from the string's memory. The range
# check sits under `@boundscheck`, so callers using `@inbounds` may elide it.
@inline function codeunit(s::String, i::Integer)
    @boundscheck if !(1 <= i <= s.len)
        throw(BoundsError(s,i))
    end
    return unsafe_load(pointer(s), i)
end
|
||||
|
||||
# Write the bytes of `s` to `io` with `unsafe_write`; the byte count `s.len` is
# reinterpreted as a `UInt` for that call.
write(io::IO, s::String) = unsafe_write(io, pointer(s), reinterpret(UInt, s.len))
|
||||
|
||||
## comparison ##
|
||||
|
||||
# Three-way comparison of two strings: `memcmp` over the common prefix decides
# first; when the prefixes are equal the byte lengths break the tie.
function cmp(a::String, b::String)
    shared = min(a.len, b.len)
    order = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, shared)
    if order < 0
        return -1
    elseif order > 0
        return +1
    end
    return cmp(a.len, b.len)
end
|
||||
|
||||
# Two strings are equal when they have the same byte length and `memcmp`
# reports identical contents.
function ==(a::String, b::String)
    a.len == b.len || return false
    return 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, a.len)
end
|
||||
|
||||
## prevind and nextind ##
|
||||
|
||||
# Step back from index `i` to the closest lower index whose byte is not a UTF-8
# continuation byte. An `i` past the last byte yields `endof(s)`; the result is
# 0 when no such index exists.
function prevind(s::String, i::Integer)
    idx = Int(i)
    idx > s.len && return endof(s)
    idx -= 1
    @inbounds while idx > 0 && is_valid_continuation(codeunit(s, idx))
        idx -= 1
    end
    return idx
end
|
||||
|
||||
# Step forward from index `i` to the closest higher index whose byte is not a
# UTF-8 continuation byte. An `i` below 1 yields 1; the result is `s.len + 1`
# when the scan runs off the end of the string.
function nextind(s::String, i::Integer)
    idx = Int(i)
    idx < 1 && return 1
    last_byte = s.len
    idx += 1
    @inbounds while idx <= last_byte && is_valid_continuation(codeunit(s, idx))
        idx += 1
    end
    return idx
end
|
||||
|
||||
## checking UTF-8 & ASCII validity ##
|
||||
|
||||
# Classify a byte sequence with the C helper `u8_isvalid`, which receives a
# pointer to the data together with its length in bytes.
function byte_string_classify(data::Vector{UInt8})
    return ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, length(data))
end

function byte_string_classify(s::String)
    return ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, s.len)
end
|
||||
# 0: neither valid ASCII nor UTF-8
|
||||
# 1: valid ASCII
|
||||
# 2: valid UTF-8
|
||||
|
||||
# The data is valid for `String` whenever `byte_string_classify` returns a
# non-zero code.
function isvalid(::Type{String}, s::Union{Vector{UInt8},String})
    return byte_string_classify(s) != 0
end

# Convenience form: validate an existing `String` against its own type.
function isvalid(s::String)
    return isvalid(String, s)
end
|
||||
|
||||
## basic UTF-8 decoding & iteration ##
|
||||
|
||||
# Bit-mask predicates on code units.

# True when clearing the low 10 bits of `c` leaves 0xd800.
function is_surrogate_lead(c::Unsigned)
    return (c & ~0x003ff) == 0xd800
end

# True when clearing the low 10 bits of `c` leaves 0xdc00.
function is_surrogate_trail(c::Unsigned)
    return (c & ~0x003ff) == 0xdc00
end

# True when clearing the low 11 bits of `c` leaves 0xd800.
function is_surrogate_codeunit(c::Unsigned)
    return (c & ~0x007ff) == 0xd800
end

# True when the top two bits of the byte are `10`, i.e. the byte has the form
# of a UTF-8 continuation byte.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
|
||||
|
||||
# Correction constants, indexed by (number of trailing bytes + 1). After
# `slow_utf8_next` has accumulated the raw bytes of a sequence (shifting by 6
# bits per byte), it subtracts the matching entry to obtain the code point.
const utf8_offset = [
    0x00000000,  # no trailing byte
    0x00003080,  # 1 trailing byte
    0x000e2080,  # 2 trailing bytes
    0x03c82080,  # 3 trailing bytes
    0xfa082080,  # 4 trailing bytes
    0x82082080,  # 5 trailing bytes
]
|
||||
|
||||
# Number of trailing bytes that follow a given first byte, indexed by
# (byte value + 1); 256 entries, 32 per row.
const utf8_trailing = [
    # 0x00-0x1f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    # 0x20-0x3f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    # 0x40-0x5f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    # 0x60-0x7f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    # 0x80-0x9f
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    # 0xa0-0xbf
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    # 0xc0-0xdf
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    # 0xe0-0xef | 0xf0-0xf7, 0xf8-0xfb, 0xfc-0xff
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
]
|
||||
|
||||
## required core functionality ##
|
||||
|
||||
# Index of the last byte of `s` that is not a UTF-8 continuation byte, found by
# scanning backwards from the final byte; 0 when there is none (e.g. `s` is empty).
function endof(s::String)
    base = pointer(s)
    idx = s.len
    while idx > 0 && is_valid_continuation(unsafe_load(base, idx))
        idx -= 1
    end
    return idx
end
|
||||
|
||||
# Number of characters in `s`, counted as the number of bytes that are not
# UTF-8 continuation bytes.
function length(s::String)
    base = pointer(s)
    nchars = 0
    for k = 1:s.len
        if !is_valid_continuation(unsafe_load(base, k))
            nchars += 1
        end
    end
    return nchars
end
|
||||
|
||||
# Out-of-line decoder used by `next` for non-ASCII data.
#   p - pointer to the string data
#   b - byte already loaded from position `i`
#   i - index of that byte
#   l - length of the data in bytes
# Returns `(character, index following the decoded sequence)`. Throws a
# `UnicodeError` when `b` is a continuation byte. When the sequence announced
# by `b` would extend past `l`, yields `'\ufffd'` and advances by a single byte.
@noinline function slow_utf8_next(p::Ptr{UInt8}, b::UInt8, i::Int, l::Int)
    if is_valid_continuation(b)
        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, unsafe_load(p,i)))
    end
    ntrail = utf8_trailing[b + 1]
    if l < i + ntrail
        return '\ufffd', i+1
    end
    # Accumulate the first byte and its trailing bytes, 6 bits at a time.
    acc::UInt32 = 0
    pos = i
    for k = 1:(ntrail + 1)
        acc = (acc << 6) + unsafe_load(p, pos)
        pos += 1
    end
    # Remove the marker bits that were accumulated along with the payload.
    acc -= utf8_offset[ntrail + 1]
    return Char(acc), pos
end
|
||||
|
||||
# Iteration is finished once the state has moved past the last byte. This
# relies on `next` handing back a state beyond the end of the String's
# underlying data, which is true for valid Strings.
function done(s::String, state)
    return state > s.len
end
|
||||
|
||||
# Return `(character at index i, following state)`.
# The work is split in two: this inlined fast path handles pure ASCII data
# (such as when parsing numbers), while `slow_utf8_next` is the longer routine
# able to handle any UTF-8 data.
@inline function next(s::String, i::Int)
    @boundscheck if !(1 <= i <= s.len)
        throw(BoundsError(s,i))
    end
    base = pointer(s)
    lead = unsafe_load(base, i)
    lead < 0x80 && return (Char(lead), i + 1)
    return slow_utf8_next(base, lead, i, s.len)
end
|
||||
|
||||
# First byte of the UTF-8 encoding of `ch`: the code point's leading bits
# combined with the length marker for a 1-, 2-, 3- or 4-byte sequence.
function first_utf8_byte(ch::Char)
    c = UInt32(ch)
    if c < 0x80
        return c % UInt8
    elseif c < 0x800
        return ((c >> 6) | 0xc0) % UInt8
    elseif c < 0x10000
        return ((c >> 12) | 0xe0) % UInt8
    end
    return ((c >> 18) | 0xf0) % UInt8
end
|
||||
|
||||
# Mirror index `i` around the byte length of `s` (`s.len + 1 - i`) and then move
# backwards while the byte at that position is a UTF-8 continuation byte.
# NOTE(review): the bytes are read with `unsafe_load` and no range check, so
# `i` is presumably expected to lie within the string — confirm against callers.
function reverseind(s::String, i::Integer)
    idx = s.len + 1 - i
    base = pointer(s)
    while is_valid_continuation(unsafe_load(base, idx))
        idx -= 1
    end
    return idx
end
|
||||
|
||||
## overload methods for efficiency ##
|
||||
|
||||
# An index is valid when it lies within the string's bytes and the byte found
# there is not a UTF-8 continuation byte.
function isvalid(s::String, i::Integer)
    1 <= i <= s.len || return false
    return !is_valid_continuation(unsafe_load(pointer(s), i))
end
|
||||
|
||||
# Substring of `s` covering the byte range `r`.
#
# - An empty range yields "".
# - `first(r)` must lie within the string (else `BoundsError`) and must not
#   point at a continuation byte (else `UnicodeError`).
# - `last(r)` must not exceed the byte length (else `BoundsError`); the copied
#   span is extended up to the byte just before `nextind(s, last(r))`, so the
#   final character is never cut in half.
function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    i, j = first(r), last(r)
    l = s.len
    if i < 1 || i > l
        throw(BoundsError(s, i))
    end
    @inbounds si = codeunit(s, i)
    if is_valid_continuation(si)
        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, si))
    end
    if j > l
        # Report the string and the offending index, like the check on `i` above.
        throw(BoundsError(s, j))
    end
    j = nextind(s,j)-1
    return unsafe_string(pointer(s,i), j-i+1)
end
|
||||
|
||||
# Find the first occurrence of character `c` in `s` at or after byte index `i`.
# Returns its index, or 0 when absent. `i == sizeof(s) + 1` yields 0; any other
# out-of-range `i` throws `BoundsError`; an `i` pointing at a continuation byte
# throws `UnicodeError`.
function search(s::String, c::Char, i::Integer = 1)
    nbytes = sizeof(s)
    if i < 1 || i > nbytes
        i == nbytes + 1 && return 0
        throw(BoundsError(s, i))
    end
    unit = codeunit(s, i)
    if is_valid_continuation(unit)
        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, unit))
    end
    # ASCII characters are a single byte: the byte search is enough.
    if c < Char(0x80)
        return search(s, c%UInt8, i)
    end
    # Otherwise look for the first byte of the encoding, then confirm that the
    # full character at that position matches; if not, resume after it.
    lead = first_utf8_byte(c)
    while true
        i = search(s, lead, i)
        if i == 0 || s[i] == c
            return i
        end
        i = next(s, i)[2]
    end
end
|
||||
|
||||
# Find the first byte equal to `b` in `a` at or after index `i`, using `memchr`.
# Returns the 1-based index, or 0 when absent. `i == sizeof(a) + 1` yields 0;
# any other out-of-range `i` throws `BoundsError`.
function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
    i < 1 && throw(BoundsError(a, i))
    n = sizeof(a)
    if i > n
        i == n+1 && return 0
        throw(BoundsError(a, i))
    end
    p = pointer(a)
    # Scan the `n-i+1` bytes starting at position `i`.
    hit = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
    if hit == C_NULL
        return 0
    end
    return Int(hit-p+1)
end
|
||||
|
||||
# Search a byte array for a character: ASCII characters go through the
# single-byte search, anything else through a search for the character's byte
# sequence, of which the start index is returned.
function search(a::ByteArray, b::Char, i::Integer = 1)
    return isascii(b) ? search(a,UInt8(b),i) :
                        search(a,Vector{UInt8}(string(b)),i).start
end
|
||||
|
||||
# Find the last occurrence of character `c` in `s` at or before byte index `i`
# (default: the last byte). Returns its index, or 0 when absent.
function rsearch(s::String, c::Char, i::Integer = s.len)
    # ASCII characters are a single byte: the byte search is enough.
    if c < Char(0x80)
        return rsearch(s, c%UInt8, i)
    end
    # Otherwise look backwards for the first byte of the encoding, then confirm
    # that the full character there matches; if not, continue before it.
    lead = first_utf8_byte(c)
    while true
        i = rsearch(s, lead, i)
        if i == 0 || s[i] == c
            return i
        end
        i = prevind(s, i)
    end
end
|
||||
|
||||
# Find the last byte equal to `b` among the first `i` bytes of `a`, using
# `memrchr`. Returns the 1-based index, or 0 when absent.
#
# `i` defaults to `sizeof(a)`, i.e. the whole buffer is searched. (The default
# used to name a non-existent variable `s`, so calling this method without an
# explicit `i` could not work.)
# `i == 0` and `i == sizeof(a) + 1` yield 0; any other out-of-range `i` throws
# `BoundsError`.
function rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a))
    if i < 1
        return i == 0 ? 0 : throw(BoundsError(a, i))
    end
    n = sizeof(a)
    if i > n
        return i == n+1 ? 0 : throw(BoundsError(a, i))
    end
    p = pointer(a)
    q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i)
    return q == C_NULL ? 0 : Int(q-p+1)
end
|
||||
|
||||
# Backward search of a byte array for a character: ASCII characters go through
# the single-byte search, anything else through a search for the character's
# byte sequence, of which the start index is returned.
function rsearch(a::ByteArray, b::Char, i::Integer = length(a))
    return isascii(b) ? rsearch(a,UInt8(b),i) :
                        rsearch(a,Vector{UInt8}(string(b)),i).start
end
|
||||
|
||||
## optimized concatenation, reverse, repeat ##
|
||||
|
||||
# Concatenate any number of `String`s. A single argument is returned as is;
# otherwise the total byte count is computed first, one result string of that
# size is allocated, and each piece is copied into place.
function string(a::String...)
    length(a) == 1 && return a[1]::String
    total = 0
    for piece in a
        total += piece.len
    end
    out = _string_n(total)
    offs = 1
    for piece in a
        nbytes = piece.len
        unsafe_copy!(pointer(out, offs), pointer(piece), nbytes)
        offs += nbytes
    end
    return out
end
|
||||
|
||||
# Number of bytes in the UTF-8 encoding of a character
function codelen(d::Char)
    c = UInt32(d)
    c < 0x80     && return 1
    c < 0x800    && return 2
    c < 0x10000  && return 3
    c < 0x110000 && return 4
    return 3 # '\ufffd'
end
|
||||
|
||||
# Concatenate a mix of `String`s and `Char`s into one `String`.
# Pass 1 sums the encoded size of every argument; pass 2 writes the bytes
# straight into the freshly allocated result: strings are block-copied,
# characters are UTF-8 encoded on the fly (code points of 0x110000 and above
# are written as the three bytes of '\ufffd').
function string(a::Union{String,Char}...)
    total = 0
    for d in a
        total += isa(d,Char) ? codelen(d::Char) : (d::String).len
    end
    out = _string_n(total)
    p = pointer(out)
    offs = 1
    for d in a
        if !isa(d,Char)
            nbytes = (d::String).len
            unsafe_copy!(pointer(out,offs), pointer(d::String), nbytes)
            offs += nbytes
            continue
        end
        c = UInt32(d::Char)
        if c < 0x80
            unsafe_store!(p, c%UInt8, offs)
            offs += 1
        elseif c < 0x800
            unsafe_store!(p, (( c >> 6          ) | 0xC0)%UInt8, offs)
            unsafe_store!(p, (( c        & 0x3F ) | 0x80)%UInt8, offs + 1)
            offs += 2
        elseif c < 0x10000
            unsafe_store!(p, (( c >> 12         ) | 0xE0)%UInt8, offs)
            unsafe_store!(p, (((c >> 6)  & 0x3F ) | 0x80)%UInt8, offs + 1)
            unsafe_store!(p, (( c        & 0x3F ) | 0x80)%UInt8, offs + 2)
            offs += 3
        elseif c < 0x110000
            unsafe_store!(p, (( c >> 18         ) | 0xF0)%UInt8, offs)
            unsafe_store!(p, (((c >> 12) & 0x3F ) | 0x80)%UInt8, offs + 1)
            unsafe_store!(p, (((c >> 6)  & 0x3F ) | 0x80)%UInt8, offs + 2)
            unsafe_store!(p, (( c        & 0x3F ) | 0x80)%UInt8, offs + 3)
            offs += 4
        else
            # '\ufffd'
            unsafe_store!(p, 0xef, offs)
            unsafe_store!(p, 0xbf, offs + 1)
            unsafe_store!(p, 0xbd, offs + 2)
            offs += 3
        end
    end
    return out
end
|
||||
|
||||
# Reverse the characters of `s` while keeping the bytes of each multi-byte
# sequence in their original order. The input is walked front to back and each
# sequence is written at the back of the output buffer, moving towards the front.
# The sequence width is derived from its first byte alone; a `UnicodeError` is
# thrown when a sequence would need more bytes than remain.
function reverse(s::String)
    dat = convert(Vector{UInt8},s)
    n = length(dat)
    n <= 1 && return s
    buf = StringVector(n)
    out = n   # number of output bytes still to be filled
    pos = 1   # read position in `dat`
    @inbounds while out > 0
        ch = dat[pos]
        # Width of the sequence introduced by `ch`.
        width = ch <= 0x7f ? 1 :
                ch <= 0xdf ? 2 :
                ch <  0xf0 ? 3 : 4
        out -= width
        out < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
        for k = 1:width
            buf[out + k] = dat[pos + k - 1]
        end
        pos += width
    end
    return String(buf)
end
|
||||
|
||||
# Build the string made of `r` consecutive copies of `s`.
# Throws `ArgumentError` for a negative `r`.
function repeat(s::String, r::Integer)
    if r < 0
        throw(ArgumentError("can't repeat a string $r times"))
    end
    n = s.len
    out = _string_n(n*r)
    if n != 1
        # General case: copy the `n` bytes of `s` once per repetition.
        for k = 1:r
            unsafe_copy!(pointer(out, 1+(k-1)*n), pointer(s), n)
        end
    else
        # common case: repeating a single ASCII char
        ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), out, unsafe_load(pointer(s)), r)
    end
    return out
end
|
||||
Reference in New Issue
Block a user