Add: julia-0.6.2

Former-commit-id: ccc667cf67d569f3fb3df39aa57c2134755a7551
This commit is contained in:
2018-02-10 10:27:19 -07:00
parent 94220957d7
commit 019f8e3064
723 changed files with 276164 additions and 0 deletions

View File

@@ -0,0 +1,500 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license
## core string functions ##
# Fallback: concrete string types are expected to supply their own `endof`;
# reaching this method raises an error that names the offending type.
endof(s::AbstractString) = error("you must implement endof(", typeof(s), ")")
# Fallback: likewise, `next(s, i::Int)` has to be supplied by each concrete string type.
next(s::AbstractString, i::Int) = error("you must implement next(", typeof(s), ",Int)")
# Directly indexable strings: the element at `i` is `s[i]` and the following state is `i+1`.
next(s::DirectIndexString, i::Int) = (s[i],i+1)
# Any other Integer index is converted to `Int` and re-dispatched to the methods above.
next(s::AbstractString, i::Integer) = next(s,Int(i))
# With no arguments the result is the empty string.
function string()
    return ""
end

# A single string argument is handed back unchanged.
function string(s::AbstractString)
    return s
end

"""
    String(s::AbstractString)

Convert a string to a contiguous byte array representation encoded as UTF-8 bytes,
by printing `s` into a new string with `print_to_string`.
This representation is often appropriate for passing strings to C.
"""
function String(s::AbstractString)
    return print_to_string(s)
end
# Generic string -> bytes: go through `String(s)` first, then use the String method.
convert(::Type{Vector{UInt8}}, s::AbstractString) = convert(Vector{UInt8}, String(s))
# `Array{UInt8}` (dimension unspecified) is served by the `Vector{UInt8}` conversion.
convert(::Type{Array{UInt8}}, s::AbstractString) = convert(Vector{UInt8}, s)
convert(::Type{String}, s::AbstractString) = String(s)
# A string converts to a character vector by collecting its iteration.
convert(::Type{Vector{Char}}, s::AbstractString) = collect(s)
convert(::Type{Symbol}, s::AbstractString) = Symbol(s)
# Symbol -> String: view the symbol as a C string and copy it with `unsafe_string`.
convert(::Type{String}, s::Symbol) = unsafe_string(Cstring(s))
## generic supplied functions ##
# Iteration over a generic string starts at index 1 and is finished once the
# state has moved past `endof(s)`.
start(s::AbstractString) = 1
done(s::AbstractString,i) = (i > endof(s))
# Scalar indexing is defined through `next`: keep the character, drop the new state.
getindex(s::AbstractString, i::Int) = next(s,i)[1]
getindex(s::AbstractString, i::Integer) = s[Int(i)]
# `s[:]` returns the string itself.
getindex(s::AbstractString, i::Colon) = s
# Ranges with non-Int endpoints are normalised to an Int range and re-dispatched.
getindex(s::AbstractString, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]
# TODO: handle other ranges with stride ±1 specially?
# Indexing with a vector of indices writes the selected characters, in order,
# into a new string (the buffer size hint is `length(v)`).
getindex(s::AbstractString, v::AbstractVector{<:Integer}) =
sprint(length(v), io->(for i in v; write(io,s[i]) end))
# Boolean masks are rejected explicitly.
getindex(s::AbstractString, v::AbstractVector{Bool}) =
throw(ArgumentError("logical indexing not supported for strings"))
# A Symbol is built from a generic string by converting it to `String` first.
Symbol(s::AbstractString) = Symbol(String(s))
"""
sizeof(s::AbstractString)
The number of bytes in string `s`.
# Example
```jldoctest
julia> sizeof("❤")
3
```
"""
# Fallback for string types that do not define their own `sizeof`: always an error.
sizeof(s::AbstractString) = error("type $(typeof(s)) has no canonical binary representation")
# Every string type iterates characters.
eltype(::Type{<:AbstractString}) = Char
"""
```
*(s::AbstractString, t::AbstractString)
```
Concatenate strings. The `*` operator is an alias to this function.
# Example
```jldoctest
julia> "Hello " * "world"
"Hello world"
```
"""
(*)(s1::AbstractString, ss::AbstractString...) = string(s1, ss...)
# The multiplicative identity for `*` (concatenation) is the empty string, converted
# to the requested string type; accepts either an instance or the type itself.
one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "")
# For directly indexable strings the last index is used as the length.
function length(s::DirectIndexString)
    return endof(s)
end

"""
    length(s::AbstractString)

Return the number of characters in string `s`.

# Example

```jldoctest
julia> length("jμΛIα")
5
```
"""
function length(s::AbstractString)
    state = start(s)
    done(s, state) && return 0
    count = 1
    # Step through the string; the count only grows when another character
    # follows the one that was just consumed.
    while true
        state = next(s, state)[2]
        done(s, state) && return count
        count += 1
    end
end
## string comparison functions ##
# Three-way comparison of two strings, character by character: returns -1, 0 or +1.
# When one string is a prefix of the other, the shorter one orders first.
function cmp(a::AbstractString, b::AbstractString)
    a === b && return 0
    ia = start(a)
    ib = start(b)
    while !done(a, ia)
        # `b` is exhausted while `a` still has characters left
        done(b, ib) && return +1
        ca, ia = next(a, ia)
        cb, ib = next(b, ib)
        if ca != cb
            return ca < cb ? -1 : +1
        end
    end
    return done(b, ib) ? 0 : -1
end
# Equality and ordering of generic strings are both derived from `cmp`.
==(a::AbstractString, b::AbstractString) = cmp(a,b) == 0
isless(a::AbstractString, b::AbstractString) = cmp(a,b) < 0
# faster comparisons for symbols
# (the result of C `strcmp` on the two symbols is reduced to its sign and returned as an Int)
cmp(a::Symbol, b::Symbol) = Int(sign(ccall(:strcmp, Int32, (Cstring, Cstring), a, b)))
isless(a::Symbol, b::Symbol) = cmp(a,b) < 0
## Generic validation functions ##
# For directly indexable strings every index within start(s):endof(s) is valid.
function isvalid(s::DirectIndexString, i::Integer)
    return start(s) <= i <= endof(s)
end

"""
    isvalid(str::AbstractString, i::Integer)

Tell whether index `i` is valid for the given string, i.e. whether a character
can be read at `i` without an error.

# Examples

```jldoctest
julia> str = "αβγdef";

julia> isvalid(str, 1)
true

julia> str[1]
'α': Unicode U+03b1 (category Ll: Letter, lowercase)

julia> isvalid(str, 2)
false

julia> str[2]
ERROR: UnicodeError: invalid character index
[...]
```
"""
function isvalid(s::AbstractString, i::Integer)
    if i < 1 || done(s, i)
        return false
    end
    # The index is valid exactly when decoding a character there does not throw.
    try
        next(s, i)
        return true
    catch
        return false
    end
end
## Generic indexing functions ##
# Direct-index strings and arrays: neighbouring indices are simply one apart.
function prevind(s::DirectIndexString, i::Integer)
    return Int(i) - 1
end
function prevind(s::AbstractArray, i::Integer)
    return Int(i) - 1
end
function nextind(s::DirectIndexString, i::Integer)
    return Int(i) + 1
end
function nextind(s::AbstractArray, i::Integer)
    return Int(i) + 1
end

"""
    prevind(str::AbstractString, i::Integer)

Get the previous valid string index before `i`.
Returns a value less than `1` at the beginning of the string.

# Examples

```jldoctest
julia> prevind("αβγdef", 3)
1

julia> prevind("αβγdef", 1)
0
```
"""
function prevind(s::AbstractString, i::Integer)
    last_index = endof(s)
    # anything past the end maps back onto the last index
    i > last_index && return last_index
    candidate = Int(i) - 1
    while candidate >= 1
        isvalid(s, candidate) && return candidate
        candidate -= 1
    end
    return 0 # out of range
end

"""
    nextind(str::AbstractString, i::Integer)

Get the next valid string index after `i`.
Returns a value greater than `endof(str)` at or after the end of the string.

# Examples

```jldoctest
julia> str = "αβγdef";

julia> nextind(str, 1)
3

julia> endof(str)
9

julia> nextind(str, 9)
10
```
"""
function nextind(s::AbstractString, i::Integer)
    last_index = endof(s)
    i < 1 && return 1
    i > last_index && return Int(i) + 1
    candidate = Int(i) + 1
    while candidate <= last_index
        isvalid(s, candidate) && return candidate
        candidate += 1
    end
    return next(s, last_index)[2] # out of range
end
# Scalar index: true when it lies within start(s):endof(s), otherwise a BoundsError is thrown.
checkbounds(s::AbstractString, i::Integer) = start(s) <= i <= endof(s) || throw(BoundsError(s, i))
# Range of indices: an empty range always passes; otherwise both its minimum and
# maximum must be in bounds.
checkbounds(s::AbstractString, r::Range{<:Integer}) = isempty(r) || (minimum(r) >= start(s) && maximum(r) <= endof(s)) || throw(BoundsError(s, r))
# The following will end up using a deprecated checkbounds, when the covariant parameter is not Integer
checkbounds(s::AbstractString, I::AbstractArray{<:Real}) = all(i -> checkbounds(s, i), I)
# Array of integer indices: each element is checked individually.
checkbounds(s::AbstractString, I::AbstractArray{<:Integer}) = all(i -> checkbounds(s, i), I)
# For directly indexable strings both conversions only bounds-check `i` and return it.
function ind2chr(s::DirectIndexString, i::Integer)
    checkbounds(s, i)
    return i
end
function chr2ind(s::DirectIndexString, i::Integer)
    checkbounds(s, i)
    return i
end

"""
    ind2chr(s::AbstractString, i::Integer)

Convert a byte index `i` to a character index with
respect to string `s`.

See also [`chr2ind`](@ref).

# Example

```jldoctest
julia> str = "αβγdef";

julia> ind2chr(str, 3)
2

julia> chr2ind(str, 2)
3
```
"""
function ind2chr(s::AbstractString, i::Integer)
    s[i] # throws error if invalid
    charpos = 1
    idx = start(s)
    # Count characters from the start until the walk reaches index `i`.
    while true
        following = next(s, idx)[2]
        i <= idx && return charpos
        charpos += 1
        idx = following
    end
end

"""
    chr2ind(s::AbstractString, i::Integer)

Convert a character index `i` to a byte index.

See also [`ind2chr`](@ref).

# Example

```jldoctest
julia> str = "αβγdef";

julia> chr2ind(str, 2)
3

julia> ind2chr(str, 3)
2
```
"""
function chr2ind(s::AbstractString, i::Integer)
    i < start(s) && throw(BoundsError(s, i))
    charpos = 1
    idx = start(s)
    # Advance one character at a time until the `i`-th character is reached.
    while true
        following = next(s, idx)[2]
        i == charpos && return idx
        charpos += 1
        idx = following
    end
end
# Iterator over the indices of the wrapped string `s`: each step yields the current
# index and advances with `nextind`. `eachindex(s)` returns one of these.
struct EachStringIndex{T<:AbstractString}
    s::T
end
eachindex(s::AbstractString) = EachStringIndex(s)
# The iterator has as many elements as the wrapped string has characters.
length(e::EachStringIndex) = length(e.s)
start(e::EachStringIndex) = start(e.s)
next(e::EachStringIndex, state) = (state, nextind(e.s, state))
done(e::EachStringIndex, state) = done(e.s, state)
# `Type{EachStringIndex}` would only match the unparameterized type itself, so concrete
# instantiations such as `EachStringIndex{String}` would miss this method. `<:` covers
# the unparameterized type and every instantiation.
eltype(::Type{<:EachStringIndex}) = Int
## character column width function ##

"""
    strwidth(s::AbstractString)

Give the number of columns needed to print a string, computed as the sum of
`charwidth` over its characters.

# Example

```jldoctest
julia> strwidth("March")
5
```
"""
function strwidth(s::AbstractString)
    width = 0
    for ch in s
        width += charwidth(ch)
    end
    return width
end

"""
    isascii(c::Union{Char,AbstractString}) -> Bool

Test whether a character belongs to the ASCII character set, or whether this is true for
all elements of a string.
"""
function isascii(c::Char)
    return c < Char(0x80)
end
function isascii(s::AbstractString)
    return all(isascii, s)
end
## string promotion rules ##

# Promoting any two string types yields `String`.
function promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString})
    return String
end

"""
    isxdigit(c::Char) -> Bool

Test whether a character is a valid hexadecimal digit. Note that this does not
include `x` (as in the standard `0x` prefix).

# Example

```jldoctest
julia> isxdigit('a')
true

julia> isxdigit('x')
false
```
"""
function isxdigit(c::Char)
    return '0' <= c <= '9' || 'a' <= c <= 'f' || 'A' <= c <= 'F'
end
## uppercase, lowercase, and titlecase transformations ##

"""
    uppercase(s::AbstractString)

Return `s` with all characters converted to uppercase.

# Example

```jldoctest
julia> uppercase("Julia")
"JULIA"
```
"""
function uppercase(s::AbstractString)
    return map(uppercase, s)
end

"""
    lowercase(s::AbstractString)

Return `s` with all characters converted to lowercase.

# Example

```jldoctest
julia> lowercase("STRINGS AND THINGS")
"strings and things"
```
"""
function lowercase(s::AbstractString)
    return map(lowercase, s)
end

"""
    titlecase(s::AbstractString)

Capitalize the first character of each word in `s`. Words are delimited by
characters for which `isspace` is true.

# Example

```jldoctest
julia> titlecase("the julia programming language")
"The Julia Programming Language"
```
"""
function titlecase(s::AbstractString)
    out = IOBuffer()
    at_word_start = true
    for ch in s
        whitespace = isspace(ch)
        # only the first non-space character after whitespace (or at the very
        # beginning) is title-cased; everything else is copied unchanged
        print(out, (whitespace || !at_word_start) ? ch : titlecase(ch))
        at_word_start = whitespace
    end
    return String(take!(out))
end

"""
    ucfirst(s::AbstractString)

Return `s` with the first character converted to uppercase. An empty string, or one
whose first character is already uppercase, is returned as-is.

# Example

```jldoctest
julia> ucfirst("python")
"Python"
```
"""
function ucfirst(s::AbstractString)
    if isempty(s) || isupper(s[1])
        return s
    end
    return string(uppercase(s[1]), s[nextind(s,1):end])
end

"""
    lcfirst(s::AbstractString)

Return `s` with the first character converted to lowercase. An empty string, or one
whose first character is already lowercase, is returned as-is.

# Example

```jldoctest
julia> lcfirst("Julia")
"julia"
```
"""
function lcfirst(s::AbstractString)
    if isempty(s) || islower(s[1])
        return s
    end
    return string(lowercase(s[1]), s[nextind(s,1):end])
end
## string map, filter, has ##
# Apply `f` to every character of `s` and assemble the results into a new String.
# `f` has to return a `Char` for each character, otherwise an ArgumentError is thrown.
function map(f, s::AbstractString)
    buf = IOBuffer(StringVector(endof(s)), true, true)
    truncate(buf, 0)
    for ch in s
        mapped = f(ch)
        isa(mapped, Char) ||
            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
        write(buf, mapped::Char)
    end
    return String(take!(buf))
end
# Build a new String from the characters of `s` for which the predicate `f` is true.
function filter(f, s::AbstractString)
    buf = IOBuffer(StringVector(endof(s)), true, true)
    truncate(buf, 0)
    for ch in s
        f(ch) && write(buf, ch)
    end
    return String(take!(buf))
end

View File

@@ -0,0 +1,15 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license
## Error messages for Unicode / UTF support
# Message templates. `<<1>>` and `<<2>>` are placeholders; the `show` method for
# UnicodeError replaces them with the error position and the hexadecimal form of the
# offending value.
const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
const UTF_ERR_INVALID_INDEX = "invalid character index <<1>> (0x<<2>> is a continuation byte)"
# Exception carrying a message template together with the values to substitute into it.
mutable struct UnicodeError <: Exception
errmsg::AbstractString ##< A UTF_ERR_ message
errpos::Int32 ##< Position of invalid character
errchr::UInt32 ##< Invalid character
end
# Print a UnicodeError: its message template is prefixed with "UnicodeError: ", then
# `<<1>>` is replaced by the error position and `<<2>>` by the offending value in hex.
function show(io::IO, exc::UnicodeError)
    template = string("UnicodeError: ", exc.errmsg)
    with_position = replace(template, "<<1>>", string(exc.errpos))
    print(io, replace(with_position, "<<2>>", hex(exc.errchr)))
end

View File

@@ -0,0 +1,451 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license
## core text I/O ##

"""
    print(io::IO, x)

Write to `io` (or to the default output stream [`STDOUT`](@ref)
if `io` is not given) a canonical (un-decorated) text representation
of a value if there is one, otherwise call [`show`](@ref).
The representation used by `print` includes minimal formatting and tries to
avoid Julia-specific details.

This generic method holds the lock of `io` while calling `show(io, x)` and
returns `nothing`.

```jldoctest
julia> print("Hello World!")
Hello World!
julia> io = IOBuffer();

julia> print(io, "Hello World!")

julia> String(take!(io))
"Hello World!"
```
"""
function print(io::IO, x)
    lock(io)
    try
        show(io, x)
    finally
        # the stream lock is released even when `show` throws
        unlock(io)
    end
    nothing
end
# Print several values in sequence while holding the lock of `io` for the whole
# batch; returns `nothing`.
function print(io::IO, xs...)
    lock(io)
    try
        for k in 1:length(xs)
            print(io, xs[k])
        end
    finally
        unlock(io)
    end
    nothing
end

"""
    println(io::IO, xs...)

Print (using [`print`](@ref)) `xs` followed by a newline.
If `io` is not supplied, prints to [`STDOUT`](@ref).
"""
function println(io::IO, xs...)
    return print(io, xs..., '\n')
end
## conversion of general objects to strings ##
# Variant of `sprint` taking a size hint for the output buffer. When `env` is given,
# `f` receives an `IOContext` wrapping the buffer with `env`; otherwise the buffer itself.
function sprint(size::Integer, f::Function, args...; env=nothing)
    buf = IOBuffer(StringVector(size), true, true)
    # specialized version of truncate(buf,0)
    buf.size = 0
    buf.ptr = 1
    if env === nothing
        f(buf, args...)
    else
        f(IOContext(buf, env), args...)
    end
    return String(resize!(buf.data, buf.size))
end

"""
    sprint(f::Function, args...)

Call the given function with an I/O stream and the supplied extra arguments.
Everything written to this I/O stream is returned as a string.

```jldoctest
julia> sprint(showcompact, 66.66666)
"66.6667"
```
"""
function sprint(f::Function, args...)
    return sprint(0, f, args...)
end
# Initial buffer-size hints, consulted by `print_to_string` for its first argument.
# Unknown values get no hint; strings use their last index; floats use fixed sizes.
tostr_sizehint(x) = 0
tostr_sizehint(x::AbstractString) = endof(x)
tostr_sizehint(x::Float64) = 20
tostr_sizehint(x::Float32) = 12
# Print all of `xs` into one new String. The buffer is pre-sized from the hint for the
# first argument. With `env` given, printing goes through an `IOContext` built from it.
function print_to_string(xs...; env=nothing)
    # specialized for performance reasons
    buf = IOBuffer(StringVector(tostr_sizehint(xs[1])), true, true)
    # specialized version of truncate(buf,0)
    buf.size = 0
    buf.ptr = 1
    if env === nothing
        for x in xs
            print(buf, x)
        end
    else
        ctx = IOContext(buf, env)
        for x in xs
            print(ctx, x)
        end
    end
    return String(resize!(buf.data, buf.size))
end
# Same as `string(xs...)`, but printing happens with the given `env` passed on to
# `print_to_string`.
string_with_env(env, xs...) = print_to_string(xs...; env=env)
"""
string(xs...)
Create a string from any values using the [`print`](@ref) function.
```jldoctest
julia> string("a", 1, true)
"a1true"
```
"""
string(xs...) = print_to_string(xs...)
# Printing a string writes its contents (no quotes) and returns nothing.
print(io::IO, s::AbstractString) = (write(io, s); nothing)
# Generic string write: write character by character and return the sum of the
# values returned by the individual `write` calls.
write(io::IO, s::AbstractString) = (len = 0; for c in s; len += write(io, c); end; len)
# `show` of a string prints it quoted and escaped.
show(io::IO, s::AbstractString) = print_quoted(io, s)
# Substring of a String into an in-memory buffer: empty substrings write nothing;
# otherwise the bytes are written straight from the parent string, starting right
# after the substring's offset.
write(to::AbstractIOBuffer, s::SubString{String}) =
s.endof==0 ? 0 : unsafe_write(to, pointer(s.string, s.offset + 1), UInt(nextind(s, s.endof) - 1))
## printing literal quoted string data ##

# this is the inverse of print_unescaped_chars(io, s, "\\\")
# Print `s` wrapped in double quotes; the only escaping performed is a backslash
# in front of each embedded double quote.
function print_quoted_literal(io, s::AbstractString)
    print(io, '"')
    for ch in s
        if ch == '"'
            print(io, "\\\"")
        else
            print(io, ch)
        end
    end
    print(io, '"')
end

"""
    repr(x)

Create a string from any value using the [`showall`](@ref) function.
"""
function repr(x)
    buf = IOBuffer()
    showall(buf, x)
    return String(take!(buf))
end
# IOBuffer views of a (byte)string:
"""
IOBuffer(string::String)
Create a read-only `IOBuffer` on the data underlying the given string.
```jldoctest
julia> io = IOBuffer("Haho");
julia> String(take!(io))
"Haho"
julia> String(take!(io))
"Haho"
```
"""
IOBuffer(str::String) = IOBuffer(Vector{UInt8}(str))
# For a substring, the buffer is built on a view of the parent's bytes that spans
# `sizeof(s)` bytes starting right after the substring's offset.
IOBuffer(s::SubString{String}) = IOBuffer(view(Vector{UInt8}(s.string), s.offset + 1 : s.offset + sizeof(s)))
# join is implemented using IO
"""
    join(io::IO, strings, delim, [last])

Join an array of `strings` into a single string, inserting the given delimiter between
adjacent strings. If `last` is given, it will be used instead of `delim` between the last
two strings. For example,

```jldoctest
julia> join(["apples", "bananas", "pineapples"], ", ", " and ")
"apples, bananas and pineapples"
```

`strings` can be any iterable over elements `x` which are convertible to strings
via `print(io::IOBuffer, x)`. `strings` will be printed to `io`.
"""
function join(io::IO, strings, delim, last)
    state = start(strings)
    done(strings, state) && return
    item, state = next(strings, state)
    print(io, item)
    finished = done(strings, state)
    while !finished
        item, state = next(strings, state)
        finished = done(strings, state)
        # the separator in front of the final item is `last`, all others are `delim`
        print(io, finished ? last : delim)
        print(io, item)
    end
end

# Three-argument form: `delim` goes between every pair of adjacent items.
function join(io::IO, strings, delim)
    state = start(strings)
    finished = done(strings, state)
    while !finished
        item, state = next(strings, state)
        finished = done(strings, state)
        print(io, item)
        finished || print(io, delim)
    end
end

# Without a delimiter the items are printed back to back.
function join(io::IO, strings)
    return join(io, strings, "")
end

# Without an `io` the joined text is captured and returned as a string.
function join(args...)
    return sprint(join, args...)
end
## string escaping & unescaping ##
# True when the character at state `i` exists and is a hex digit. In that case a
# preceding hex escape is padded to full width so the digit is not read as part of it.
function need_full_hex(s::AbstractString, i::Int)
    return !done(s, i) && isxdigit(next(s, i)[1])
end

# Escape text for a NUL character: the long form is used when the character at
# state `i` is an octal digit, the short form otherwise.
function escape_nul(s::AbstractString, i::Int)
    if !done(s, i) && '0' <= next(s, i)[1] <= '7'
        return "\\x00"
    end
    return "\\0"
end

"""
    escape_string([io,] str::AbstractString[, esc::AbstractString]) -> AbstractString

General escaping of traditional C and Unicode escape sequences.
Any characters in `esc` are also escaped (with a backslash).
See also [`unescape_string`](@ref).
"""
function escape_string(io, s::AbstractString, esc::AbstractString)
    i = start(s)
    while !done(s, i)
        c, j = next(s, i)
        # The order of these tests matters: the first matching rule wins.
        if c == '\0'
            print(io, escape_nul(s, j))
        elseif c == '\e'
            print(io, "\\e")
        elseif c == '\\'
            print(io, "\\\\")
        elseif c in esc
            print(io, '\\', c)
        elseif '\a' <= c <= '\r'
            print(io, '\\', "abtnvfr"[Int(c)-6])
        elseif isprint(c)
            print(io, c)
        elseif c <= '\x7f'
            print(io, "\\x", hex(c, 2))
        elseif c <= '\uffff'
            print(io, "\\u", hex(c, need_full_hex(s, j) ? 4 : 2))
        else
            print(io, "\\U", hex(c, need_full_hex(s, j) ? 8 : 4))
        end
        i = j
    end
end

# One-argument form: escape into a new string, additionally escaping double quotes.
function escape_string(s::AbstractString)
    return sprint(endof(s), escape_string, s, "\"")
end
# Print `s` between double quotes, escaped with `escape_string`; double quotes and
# dollar signs are escaped in addition to the standard sequences.
function print_quoted(io, s::AbstractString)
    quote_char = '"'
    print(io, quote_char)
    escape_string(io, s, "\"\$") #"# work around syntax highlighting problem
    print(io, quote_char)
end
# bare minimum unescaping function unescapes only given characters
function print_unescaped_chars(io, s::AbstractString, esc::AbstractString)
    # the backslash itself is always part of the set of escapable characters
    escapable = ('\\' in esc) ? esc : string("\\", esc)
    pos = start(s)
    while !done(s, pos)
        ch, pos = next(s, pos)
        # drop a backslash only when it precedes one of the escapable characters
        if ch == '\\' && !done(s, pos) && s[pos] in escapable
            ch, pos = next(s, pos)
        end
        print(io, ch)
    end
end

# String-returning wrapper around `print_unescaped_chars`.
function unescape_chars(s::AbstractString, esc::AbstractString)
    return sprint(endof(s), print_unescaped_chars, s, esc)
end
# general unescaping of traditional C and Unicode escape sequences
"""
unescape_string([io,] s::AbstractString) -> AbstractString
General unescaping of traditional C and Unicode escape sequences. Reverse of
[`escape_string`](@ref).
"""
function unescape_string(io, s::AbstractString)
i = start(s)
while !done(s,i)
c, i = next(s,i)
# A backslash only starts an escape when at least one more character follows it;
# a trailing backslash is printed as-is by the `else` branch at the bottom.
if !done(s,i) && c == '\\'
c, i = next(s,i)
if c == 'x' || c == 'u' || c == 'U'
# Hex escape: accumulate up to `m` hex digits into `n` (2 for x, 4 for u, 8 for U).
n = k = 0
m = c == 'x' ? 2 :
c == 'u' ? 4 : 8
# `k` counts loop entries; the loop stops after `m` digits, at the end of the
# string, or (via `break`) at the first non-hex character, which is not consumed.
while (k+=1) <= m && !done(s,i)
c, j = next(s,i)
n = '0' <= c <= '9' ? n<<4 + c-'0' :
'a' <= c <= 'f' ? n<<4 + c-'a'+10 :
'A' <= c <= 'F' ? n<<4 + c-'A'+10 : break
i = j
end
# `k` is still 1 exactly when no hex digit was consumed at all.
if k == 1
throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" :
"unicode (\\u)") escape sequence used in $(repr(s))"))
end
# A two-digit escape emits one raw byte; the longer forms emit a character.
if m == 2 # \x escape sequence
write(io, UInt8(n))
else
print(io, Char(n))
end
elseif '0' <= c <= '7'
# Octal escape: the first digit is already in `c`; read up to two more.
k = 1
n = c-'0'
while (k+=1) <= 3 && !done(s,i)
c, j = next(s,i)
n = ('0' <= c <= '7') ? n<<3 + c-'0' : break
i = j
end
# The result is written as a single byte, so it has to fit into 8 bits.
if n > 255
throw(ArgumentError("octal escape sequence out of range"))
end
write(io, UInt8(n))
else
# Single-letter escapes; any other escaped character stands for itself.
print(io, c == 'a' ? '\a' :
c == 'b' ? '\b' :
c == 't' ? '\t' :
c == 'n' ? '\n' :
c == 'v' ? '\v' :
c == 'f' ? '\f' :
c == 'r' ? '\r' :
c == 'e' ? '\e' : c)
end
else
print(io, c)
end
end
end
# String-returning wrapper: the output buffer is pre-sized with `endof(s)`.
unescape_string(s::AbstractString) = sprint(endof(s), unescape_string, s)
# b"..." literal: the literal text is unescaped when the macro runs, and the generated
# expression converts the resulting string to a `Vector{UInt8}`.
macro b_str(s); :(Vector{UInt8}($(unescape_string(s)))); end
# raw"..." literal: the literal text is returned unchanged.
macro raw_str(s); s; end
## multiline strings ##

"""
    indentation(str::AbstractString; tabwidth=8)

Calculate the width of leading blank space, and also return if string is blank.
A space adds one column; a tab advances to the next multiple of `tabwidth`.

Returns:

* width of leading whitespace, flag if string is totally blank
"""
function indentation(str::AbstractString; tabwidth=8)
    width = 0
    for ch in str
        if ch == '\t'
            width = div(width + tabwidth, tabwidth) * tabwidth
        elseif ch == ' '
            width += 1
        else
            # first non-blank character: the string is not blank
            return width, false
        end
    end
    return width, true
end

"""
    unindent(str::AbstractString, indent::Int; tabwidth=8)

Remove leading indentation from string. Tabs are expanded to spaces using
`tabwidth`; `str` itself is returned when `indent` is zero.

Returns:

* `String` of multiline string, with leading indentation of `indent` removed
"""
function unindent(str::AbstractString, indent::Int; tabwidth=8)
    if indent == 0
        return str
    end
    pos = start(str)
    endpos = endof(str)
    # Note: this loses the type of the original string
    buf = IOBuffer(StringVector(endpos), true, true)
    truncate(buf, 0)
    # write `n` spaces to the output (nothing when `n` is not positive)
    function pad(n)
        for k = 1:n
            write(buf, ' ')
        end
    end
    in_indent = true # still inside the leading whitespace of the current line
    col = 0          # current column (0 based)
    while pos <= endpos
        ch, pos = next(str, pos)
        if in_indent
            if ch == ' '
                col += 1
            elseif ch == '\t'
                col = div(col + tabwidth, tabwidth) * tabwidth
            elseif ch == '\n'
                # whitespace-only line: keep whatever exceeds `indent`
                pad(col - indent)
                col = 0
                write(buf, '\n')
            else
                in_indent = false
                # output enough indentation to get to the correct place
                pad(col - indent)
                col += 1
                write(buf, ch)
            end
        elseif ch == '\t' # Handle internal tabs
            next_stop = div(col + tabwidth, tabwidth) * tabwidth
            # output the number of spaces that would have been seen
            # with original indentation
            pad(next_stop - col)
            col = next_stop
        elseif ch == '\n'
            in_indent = true
            col = 0
            write(buf, '\n')
        else
            col += 1
            write(buf, ch)
        end
    end
    # If the string ended inside leading whitespace, that whitespace still has
    # to be emitted (minus the removed indentation).
    in_indent && pad(col - indent)
    return String(take!(buf))
end
# Build a String from a vector of characters. A lead surrogate that is immediately
# followed by a trail surrogate is combined into the single supplementary-plane
# character the pair encodes; every other character is written unchanged.
function convert(::Type{String}, chars::AbstractVector{Char})
    sprint(length(chars), io->begin
        state = start(chars)
        while !done(chars, state)
            c, state = next(chars, state)
            # `c` in U+D800..U+DBFF is a lead surrogate. Only look at the following
            # element when there is one: without the `done` check a lead surrogate in
            # the last position made `next` read past the end of `chars`.
            if '\ud7ff' < c && c + 1024 < '\ue000' && !done(chars, state)
                d, state = next(chars, state)
                if '\ud7ff' < d - 1024 && d < '\ue000'
                    # `d` is a trail surrogate (U+DC00..U+DFFF): merge the pair
                    c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) | (UInt32(d) & 0x03ff))
                else
                    # unpaired lead surrogate: emit it as-is and continue with `d`
                    write(io, c)
                    c = d
                end
            end
            write(io, c)
        end
    end)
end

View File

@@ -0,0 +1,380 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license
# Things that can be searched for as "one of these characters".
const Chars = Union{Char,Tuple{Vararg{Char}},AbstractVector{Char},Set{Char}}

"""
    search(string::AbstractString, chars::Chars, [start::Integer])

Search for the first occurrence of the given characters within the given string. The second
argument may be a single character, a vector or a set of characters, a string, or a regular
expression (though regular expressions are only allowed on contiguous strings, such as ASCII
or UTF-8 strings). The third argument optionally specifies a starting index. The return
value is a range of indexes where the matching sequence is found, such that
`s[search(s,x)] == x`:

`search(string, "substring")` = `start:end` such that `string[start:end] == "substring"`, or
`0:-1` if unmatched.

`search(string, 'c')` = `index` such that `string[index] == 'c'`, or `0` if unmatched.

```jldoctest
julia> search("Hello to the world", "z")
0:-1

julia> search("JuliaLang","Julia")
1:5
```
"""
function search(s::AbstractString, c::Chars, i::Integer)
    # the start position may be anywhere from 1 up to one past the last index
    if i < 1 || i > nextind(s, endof(s))
        throw(BoundsError(s, i))
    end
    # an empty set of characters matches right at the start position
    isempty(c) && return i
    while !done(s, i)
        ch, following = next(s, i)
        if ch in c
            return i
        end
        i = following
    end
    return 0
end

# Without a start index the search begins at the start of the string.
function search(s::AbstractString, c::Chars)
    return search(s, c, start(s))
end

# Character membership in a string is a search that finds something.
function in(c::Char, s::AbstractString)
    return search(s, c) != 0
end
# Generic substring search: index of the first occurrence of `t` in `s` at or after
# `i`, or 0 when there is none. Works through the iteration protocol only.
function _searchindex(s, t, i)
    if isempty(t)
        return 1 <= i <= nextind(s,endof(s)) ? i :
               throw(BoundsError(s, i))
    end
    first_char, t_rest = next(t, start(t))
    while true
        # find the next candidate: an occurrence of the first character of `t`
        i = search(s, first_char, i)
        i == 0 && return 0
        after_first = next(s, i)[2]
        tpos = t_rest
        spos = after_first
        matched = true
        # compare the remainder of `t` against what follows the candidate
        while !done(t, tpos)
            if done(s, spos)
                matched = false
                break
            end
            sc, spos = next(s, spos)
            tc, tpos = next(t, tpos)
            if sc != tc
                matched = false
                break
            end
        end
        matched && return i
        i = after_first
    end
end
# Map a value to a 64-bit mask with a single bit set; the bit position is given by
# the low six bits of `c`.
_search_bloom_mask(c) = UInt64(1) << (c & 63)
# Uniform element access for the byte-oriented search routines: the i-th code unit
# of a String, or the i-th element of a byte array.
_nthbyte(s::String, i) = codeunit(s, i)
_nthbyte(a::ByteArray, i) = a[i]
# Byte-level substring search over Strings / byte arrays. Returns the 1-based byte
# index of the first occurrence of `t` in `s` at or after `i`, or 0 if there is none.
function _searchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, i)
n = sizeof(t)
m = sizeof(s)
# Special cases: empty needle, empty haystack, single-byte needle.
if n == 0
return 1 <= i <= m+1 ? max(1, i) : 0
elseif m == 0
return 0
elseif n == 1
return search(s, _nthbyte(t,1), i)
end
# `w` is the largest 0-based offset at which `t` still fits into `s`.
w = m - n
if w < 0 || i - 1 > w
return 0
end
# Preprocess the needle: `bloom_mask` collects one bit per needle byte, and `skip`
# is derived from the last position before the end at which the final byte of `t`
# also occurs (n - 1 if it occurs nowhere else).
bloom_mask = UInt64(0)
skip = n - 1
tlast = _nthbyte(t,n)
for j in 1:n
bloom_mask |= _search_bloom_mask(_nthbyte(t,j))
if _nthbyte(t,j) == tlast && j < n
skip = n - j - 1
end
end
# From here on `i` is a 0-based offset into `s`.
i -= 1
while i <= w
if _nthbyte(s,i+n) == tlast
# check candidate
j = 0
while j < n - 1
if _nthbyte(s,i+j+1) != _nthbyte(t,j+1)
break
end
j += 1
end
# match found
if j == n - 1
return i+1
end
# no match, try to rule out the next character
# (if the byte just past the window has no bit in the mask, the whole window is
# jumped over; otherwise shift by `skip`)
if i < w && bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0
i += n
else
i += skip
end
elseif i < w
if bloom_mask & _search_bloom_mask(_nthbyte(s,i+n+1)) == 0
i += n
end
end
i += 1
end
0
end
# Byte arrays use the byte-level implementation directly.
searchindex(s::ByteArray, t::ByteArray, i) = _searchindex(s,t,i)
"""
searchindex(s::AbstractString, substring, [start::Integer])
Similar to [`search`](@ref), but return only the start index at which
the substring is found, or `0` if it is not.
```jldoctest
julia> searchindex("Hello to the world", "z")
0
julia> searchindex("JuliaLang","Julia")
1
julia> searchindex("JuliaLang","Lang")
6
```
"""
searchindex(s::AbstractString, t::AbstractString, i::Integer) = _searchindex(s,t,i)
# When no start index is given, searching begins at `start(s)`.
searchindex(s::AbstractString, t::AbstractString) = searchindex(s,t,start(s))
searchindex(s::AbstractString, c::Char, i::Integer) = _searchindex(s,c,i)
searchindex(s::AbstractString, c::Char) = searchindex(s,c,start(s))
# String-in-String search; a needle whose last index is 1 is handled as a search
# for its first character.
function searchindex(s::String, t::String, i::Integer=1)
    # Check for fast case of a single byte
    # (for multi-byte UTF-8 sequences, use searchindex on byte arrays instead)
    return endof(t) == 1 ? search(s, t[1], i) : _searchindex(s, t, i)
end
# Turn the start index found by `searchindex` into the index range of the match:
# an empty range at that index for an empty needle, and `0:-1` when nothing was found.
function _search(s, t, i::Integer)
    first_idx = searchindex(s, t, i)
    isempty(t) && return first_idx:(first_idx - 1)
    last_idx = first_idx > 0 ? first_idx + endof(t) - 1 : -1
    return first_idx:last_idx
end

function search(s::AbstractString, t::AbstractString, i::Integer=start(s))
    return _search(s, t, i)
end
function search(s::ByteArray, t::ByteArray, i::Integer=start(s))
    return _search(s, t, i)
end
# Reverse character search: search the reversed string forwards and map the hit
# back to an index of `s`; 0 means not found.
function rsearch(s::AbstractString, c::Chars)
    rev_idx = search(RevString(s), c)
    return rev_idx == 0 ? 0 : endof(s) - rev_idx + 1
end

"""
    rsearch(s::AbstractString, chars::Chars, [start::Integer])

Similar to [`search`](@ref), but returning the last occurrence of the given characters within the
given string, searching in reverse from `start`.

```jldoctest
julia> rsearch("aaabbb","b")
6:6
```
"""
function rsearch(s::AbstractString, c::Chars, i::Integer)
    last_index = endof(s)
    # index `i` of `s` corresponds to index `last_index - i + 1` of the reversed string
    rev_idx = search(RevString(s), c, last_index - i + 1)
    return rev_idx == 0 ? 0 : last_index - rev_idx + 1
end
# Generic reverse substring search: both strings are walked backwards through
# `RevString` wrappers. Returns the start index of the match in `s`, or 0.
function _rsearchindex(s, t, i)
    if isempty(t)
        return 1 <= i <= nextind(s,endof(s)) ? i :
               throw(BoundsError(s, i))
    end
    rev_t = RevString(t)
    rev_s = RevString(s)
    last_index = endof(s)
    t_last, t_rest = next(rev_t, start(rev_t))
    while true
        # candidate: an occurrence of the final character of `t`, searching backwards
        i = rsearch(s, t_last, i)
        i == 0 && return 0
        after_candidate = next(rev_s, last_index - i + 1)[2]
        tpos = t_rest
        spos = after_candidate
        matched = true
        # compare the rest of the reversed needle against the reversed haystack
        while !done(rev_t, tpos)
            if done(rev_s, spos)
                matched = false
                break
            end
            sc, spos = next(rev_s, spos)
            tc, tpos = next(rev_t, tpos)
            if sc != tc
                matched = false
                break
            end
        end
        matched && return nextind(s, last_index - spos + 1)
        i = last_index - after_candidate + 1
    end
end
# Byte-level reverse substring search over Strings / byte arrays. Returns the 1-based
# byte index at which the found occurrence of `t` starts, or 0 if there is none.
# The scan starts at the highest position allowed by `k` and moves towards index 1.
function _rsearchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, k)
n = sizeof(t)
m = sizeof(s)
# Special cases: empty needle, empty haystack, single-byte needle.
if n == 0
return 0 <= k <= m ? max(k, 1) : 0
elseif m == 0
return 0
elseif n == 1
return rsearch(s, _nthbyte(t,1), k)
end
w = m - n
if w < 0 || k <= 0
return 0
end
# Preprocess the needle: `bloom_mask` collects one bit per needle byte, and `skip`
# is derived from the first position after the start at which the first byte of `t`
# also occurs (n - 1 if it occurs nowhere else).
bloom_mask = UInt64(0)
skip = n - 1
tfirst = _nthbyte(t,1)
for j in n:-1:1
bloom_mask |= _search_bloom_mask(_nthbyte(t,j))
if _nthbyte(t,j) == tfirst && j > 1
skip = j - 2
end
end
# `i` is the 1-based candidate start position, capped so that `t` fits into `s`.
i = min(k - n + 1, w + 1)
while i > 0
if _nthbyte(s,i) == tfirst
# check candidate
j = 1
while j < n
if _nthbyte(s,i+j) != _nthbyte(t,j+1)
break
end
j += 1
end
# match found
if j == n
return i
end
# no match, try to rule out the next character
# (if the byte just before the window has no bit in the mask, the whole window is
# jumped over; otherwise shift by `skip`)
if i > 1 && bloom_mask & _search_bloom_mask(_nthbyte(s,i-1)) == 0
i -= n
else
i -= skip
end
elseif i > 1
if bloom_mask & _search_bloom_mask(_nthbyte(s,i-1)) == 0
i -= n
end
end
i -= 1
end
0
end
# Byte arrays use the byte-level implementation directly.
rsearchindex(s::ByteArray, t::ByteArray, i::Integer) = _rsearchindex(s,t,i)
"""
rsearchindex(s::AbstractString, substring, [start::Integer])
Similar to [`rsearch`](@ref), but return only the start index at which the substring is found, or `0` if it is not.
```jldoctest
julia> rsearchindex("aaabbb","b")
6
julia> rsearchindex("aaabbb","a")
3
```
"""
rsearchindex(s::AbstractString, t::AbstractString, i::Integer) = _rsearchindex(s,t,i)
# Without a start index the search begins at `endof(s)`; two empty strings are
# special-cased to return 1.
rsearchindex(s::AbstractString, t::AbstractString) = (isempty(s) && isempty(t)) ? 1 : rsearchindex(s,t,endof(s))
# Reverse String-in-String search over the whole haystack.
function rsearchindex(s::String, t::String)
    # Check for fast case of a single byte
    # (for multi-byte UTF-8 sequences, use rsearchindex instead)
    return endof(t) == 1 ? rsearch(s, t[1]) : _rsearchindex(s, t, sizeof(s))
end

# Reverse String-in-String search starting from index `i`.
function rsearchindex(s::String, t::String, i::Integer)
    t_end = endof(t)
    # Check for fast case of a single byte
    # (for multi-byte UTF-8 sequences, use rsearchindex instead)
    t_end == 1 && return rsearch(s, t[1], i)
    # general case: the byte-level search is limited to the bytes before nextind(s, i)
    t_end != 0 && return _rsearchindex(s, t, nextind(s, i) - 1)
    # empty needle
    i > sizeof(s) && return 0
    return i == 0 ? 1 : i
end
# Turn the start index found by `rsearchindex` into the index range of the match:
# an empty range at that index for an empty needle, and `0:-1` when nothing was found.
function _rsearch(s, t, i::Integer)
    first_idx = rsearchindex(s, t, i)
    isempty(t) && return first_idx:(first_idx - 1)
    last_idx = first_idx > 0 ? first_idx + endof(t) - 1 : -1
    return first_idx:last_idx
end

function rsearch(s::AbstractString, t::AbstractString, i::Integer=endof(s))
    return _rsearch(s, t, i)
end
function rsearch(s::ByteArray, t::ByteArray, i::Integer=endof(s))
    return _rsearch(s, t, i)
end

"""
    contains(haystack::AbstractString, needle::AbstractString)

Determine whether the second argument is a substring of the first.

```jldoctest
julia> contains("JuliaLang is pretty cool!", "Julia")
true
```
"""
function contains(haystack::AbstractString, needle::AbstractString)
    return searchindex(haystack, needle) != 0
end

# `in` between two strings is rejected on purpose and points the caller to `contains`.
function in(::AbstractString, ::AbstractString)
    return error("use contains(x,y) for string containment")
end

View File

@@ -0,0 +1,438 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license
# Byte vectors accepted by the byte-oriented routines: signed or unsigned 8-bit elements.
const ByteArray = Union{Vector{UInt8},Vector{Int8}}
## constructors and conversions ##
# String constructor docstring from boot.jl, workaround for #16730
# and the unavailability of @doc in boot.jl context.
"""
String(v::Vector{UInt8})
Create a new `String` from a vector `v` of bytes containing
UTF-8 encoded characters. This function takes "ownership" of
the array, which means that you should not subsequently modify
`v` (since strings are supposed to be immutable in Julia) for
as long as the string exists.
If you need to subsequently modify `v`, use `String(copy(v))` instead.
"""
function String(v::Array{UInt8,1})
# the conversion itself is done by the runtime function `jl_array_to_string`
ccall(:jl_array_to_string, Ref{String}, (Any,), v)
end
"""
unsafe_string(p::Ptr{UInt8}, [length::Integer])
Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
(The pointer can be safely freed afterwards.) If `length` is specified
(the length of the data in bytes), the string does not have to be NUL-terminated.
This function is labelled "unsafe" because it will crash if `p` is not
a valid memory address to data of the requested length.
"""
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
# a NULL pointer is rejected up front with an ArgumentError
p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
# Calls the runtime function `jl_alloc_string` with `n` and returns the resulting String.
_string_n(n::Integer) = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n)
# String -> bytes is done by the runtime function `jl_string_to_array`.
convert(::Type{Vector{UInt8}}, s::String) = ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
convert(::Type{String}, s::String) = s
convert(::Type{String}, v::Vector{UInt8}) = String(v)
## low-level functions ##
# Raw pointer to the first byte of the string data, and to its i-th byte (1-based).
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
pointer(s::String, i::Integer) = pointer(s)+(i-1)
# The size of a String in bytes is its `len` field.
sizeof(s::String) = s.len
"""
codeunit(s::AbstractString, i::Integer)
Get the `i`th code unit of an encoded string. For example,
returns the `i`th byte of the representation of a UTF-8 string.
"""
codeunit(s::AbstractString, i::Integer)
# String method: bounds-checked load of the i-th byte (the check sits in a
# `@boundscheck` block).
@inline function codeunit(s::String, i::Integer)
@boundscheck if (i < 1) | (i > s.len)
throw(BoundsError(s,i))
end
unsafe_load(pointer(s),i)
end
# Writing a String hands its `len` bytes to `unsafe_write` in one call.
write(io::IO, s::String) = unsafe_write(io, pointer(s), reinterpret(UInt, s.len))
## comparison ##
# Three-way comparison of two Strings: `memcmp` over the common prefix decides; when
# the prefixes are equal the byte lengths are compared.
function cmp(a::String, b::String)
    common = min(a.len, b.len)
    order = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, common)
    if order < 0
        return -1
    elseif order > 0
        return +1
    end
    return cmp(a.len, b.len)
end

# Two Strings are equal when they have the same byte length and identical bytes.
function ==(a::String, b::String)
    a.len == b.len || return false
    return 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, a.len)
end
## prevind and nextind ##
# Previous character index in a String: step back one byte, then keep stepping back
# while the byte under the cursor is a continuation byte. An index past the last
# byte maps to `endof(s)`; the result is 0 when nothing precedes `i`.
function prevind(s::String, i::Integer)
    idx = Int(i)
    idx > s.len && return endof(s)
    idx -= 1
    @inbounds while idx > 0 && is_valid_continuation(codeunit(s, idx))
        idx -= 1
    end
    return idx
end
# Next character index in a String: step forward one byte, then keep stepping forward
# while the byte under the cursor is a continuation byte. Indices below 1 map to 1;
# the result may be one past the last byte.
function nextind(s::String, i::Integer)
    idx = Int(i)
    idx < 1 && return 1
    nbytes = s.len
    idx += 1
    @inbounds while idx <= nbytes && is_valid_continuation(codeunit(s, idx))
        idx += 1
    end
    return idx
end
## checking UTF-8 & ASCII validity ##
# Classify a byte sequence with the C helper `u8_isvalid`; the meaning of the
# returned code is listed below the two methods.
byte_string_classify(data::Vector{UInt8}) =
ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, length(data))
byte_string_classify(s::String) =
ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, s.len)
# 0: neither valid ASCII nor UTF-8
# 1: valid ASCII
# 2: valid UTF-8
# Data is a valid String whenever the classification is non-zero.
isvalid(::Type{String}, s::Union{Vector{UInt8},String}) = byte_string_classify(s) != 0
isvalid(s::String) = isvalid(String, s)
## basic UTF-8 decoding & iteration ##
# Lead surrogate: the value with its low 10 bits cleared equals 0xd800.
function is_surrogate_lead(c::Unsigned)
    return (c & ~0x003ff) == 0xd800
end

# Trail surrogate: the value with its low 10 bits cleared equals 0xdc00.
function is_surrogate_trail(c::Unsigned)
    return (c & ~0x003ff) == 0xdc00
end

# Any surrogate: the value with its low 11 bits cleared equals 0xd800.
function is_surrogate_codeunit(c::Unsigned)
    return (c & ~0x007ff) == 0xd800
end

# UTF-8 continuation byte: the two top bits are `10`.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
# Correction constants subtracted from the accumulated byte sum in
# `slow_utf8_next`; indexed by (number of trailing bytes + 1).
const utf8_offset = [
0x00000000, 0x00003080,
0x000e2080, 0x03c82080,
0xfa082080, 0x82082080,
]
# Lookup table indexed by (lead byte + 1): how many bytes follow that lead byte
# in its encoded sequence.
const utf8_trailing = [
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
]
## required core functionality ##
# Byte index of the last character start in `s` (0 for an empty string):
# scan backwards from the final byte past any continuation bytes.
function endof(s::String)
    base = pointer(s)
    idx = s.len
    while idx > 0 && is_valid_continuation(unsafe_load(base, idx))
        idx -= 1
    end
    return idx
end
# Number of characters in `s`: count every byte that is not a continuation byte.
function length(s::String)
    base = pointer(s)
    nchars = 0
    for k = 1:s.len
        if !is_valid_continuation(unsafe_load(base, k))
            nchars += 1
        end
    end
    return nchars
end
# Slow path of `next(::String, ::Int)` for a non-ASCII lead byte.
# `p` points at the string data, `b` is the byte at index `i`, `l` is the byte
# length. Returns `(char, next_index)`; throws `UnicodeError` when `b` is a
# continuation byte.
@noinline function slow_utf8_next(p::Ptr{UInt8}, b::UInt8, i::Int, l::Int)
if is_valid_continuation(b)
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, unsafe_load(p,i)))
end
# number of bytes expected after the lead byte
trailing = utf8_trailing[b + 1]
if l < i + trailing
# the sequence would run past the end of the data: yield U+FFFD, advance one byte
return '\ufffd', i+1
end
c::UInt32 = 0
# fold all bytes of the sequence together, shifting 6 bits per byte
for j = 1:(trailing + 1)
c <<= 6
c += unsafe_load(p,i)
i += 1
end
# subtract the per-length correction constant to obtain the code point
c -= utf8_offset[trailing + 1]
return Char(c), i
end
# This implementation relies on `next` returning a value past the end of the
# String's underlying data, which is true for valid Strings
done(s::String, state) = state > s.len
# Decode the character starting at byte index `i`; returns `(char, next_index)`.
# Throws `BoundsError` for `i` outside `1:s.len` unless bounds checks are elided.
@inline function next(s::String, i::Int)
# function is split into this critical fast-path
# for pure ascii data, such as parsing numbers,
# and a longer function that can handle any utf8 data
@boundscheck if (i < 1) | (i > s.len)
throw(BoundsError(s,i))
end
p = pointer(s)
b = unsafe_load(p, i)
if b < 0x80
# single-byte (ASCII) character
return Char(b), i + 1
end
return slow_utf8_next(p, b, i, s.len)
end
# First byte of the UTF-8 encoding of `ch`, selected by the size of its code
# point (1-, 2-, 3- or 4-byte form).
function first_utf8_byte(ch::Char)
    c = UInt32(ch)
    if c < 0x80
        return c % UInt8
    elseif c < 0x800
        return ((c >> 6) | 0xc0) % UInt8
    elseif c < 0x10000
        return ((c >> 12) | 0xe0) % UInt8
    end
    return ((c >> 18) | 0xf0) % UInt8
end
# Map index `i` of the reversed string back to an index into `s`: mirror the
# byte position, then walk back to the start of the character containing it.
function reverseind(s::String, i::Integer)
    base = pointer(s)
    idx = s.len + 1 - i
    while is_valid_continuation(unsafe_load(base, idx))
        idx -= 1
    end
    return idx
end
## overload methods for efficiency ##
# `i` is a valid index when it is within `1:s.len` and the byte there is not a
# continuation byte.
isvalid(s::String, i::Integer) =
(1 <= i <= s.len) && !is_valid_continuation(unsafe_load(pointer(s),i))
# Copy the substring covering byte range `r` into a new String.
# An empty range yields "". `first(r)` must be the start of a character
# (otherwise `UnicodeError`); `last(r)` is extended to the end of the character
# it falls in. Throws `BoundsError(s, index)` for an out-of-range endpoint.
function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    i, j = first(r), last(r)
    l = s.len
    if i < 1 || i > l
        throw(BoundsError(s, i))
    end
    @inbounds si = codeunit(s, i)
    if is_valid_continuation(si)
        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, si))
    end
    if j > l
        # report the string and offending index, like the lower-bound check above
        throw(BoundsError(s, j))
    end
    # include all bytes of the last character
    j = nextind(s,j)-1
    return unsafe_string(pointer(s,i), j-i+1)
end
# Find the first occurrence of character `c` in `s` at or after byte index `i`.
# Returns the index, or 0 when not found (including `i == sizeof(s) + 1`).
# Throws `BoundsError` for other out-of-range `i` and `UnicodeError` when `i`
# points at a continuation byte.
function search(s::String, c::Char, i::Integer = 1)
if i < 1 || i > sizeof(s)
i == sizeof(s) + 1 && return 0
throw(BoundsError(s, i))
end
if is_valid_continuation(codeunit(s,i))
throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, codeunit(s,i)))
end
# ASCII characters are a single byte: use the byte search directly
c < Char(0x80) && return search(s, c%UInt8, i)
while true
# find a candidate by its lead byte, then confirm the whole character
i = search(s, first_utf8_byte(c), i)
(i==0 || s[i] == c) && return i
i = next(s,i)[2]
end
end
# Find the first byte equal to `b` in `a` at or after index `i` using `memchr`.
# Returns the 1-based index, or 0 when absent or when `i == sizeof(a) + 1`.
# Throws `BoundsError` for any other out-of-range `i`.
function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
if i < 1
throw(BoundsError(a, i))
end
n = sizeof(a)
if i > n
return i == n+1 ? 0 : throw(BoundsError(a, i))
end
p = pointer(a)
# scan the remaining n-i+1 bytes starting at offset i-1
q = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p+i-1, b, n-i+1)
q == C_NULL ? 0 : Int(q-p+1)
end
# Search a byte array for character `b`: an ASCII character is searched as a
# single byte, anything else as its encoded byte sequence (start index returned).
function search(a::ByteArray, b::Char, i::Integer = 1)
    isascii(b) && return search(a, UInt8(b), i)
    return search(a, Vector{UInt8}(string(b)), i).start
end
# Find the last occurrence of character `c` in `s` at or before byte index `i`
# (default: the last byte). Returns the index, or 0 when not found.
function rsearch(s::String, c::Char, i::Integer = s.len)
    if c < Char(0x80)
        # ASCII: a plain byte search is enough
        return rsearch(s, c%UInt8, i)
    end
    lead = first_utf8_byte(c)
    while true
        # locate a candidate by lead byte, then confirm the full character
        i = rsearch(s, lead, i)
        if i == 0 || s[i] == c
            return i
        end
        i = prevind(s, i)
    end
end
# Find the last byte equal to `b` in `a` at or before index `i` using `memrchr`.
# `i` defaults to the last byte of `a`. Returns the 1-based index, or 0 when
# absent or when `i` is 0 or `sizeof(a) + 1`. Throws `BoundsError` for any other
# out-of-range `i`.
function rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a))
    # The default must refer to the parameter `a`; no variable `s` exists here.
    if i < 1
        return i == 0 ? 0 : throw(BoundsError(a, i))
    end
    n = sizeof(a)
    if i > n
        return i == n+1 ? 0 : throw(BoundsError(a, i))
    end
    p = pointer(a)
    # scan the first i bytes backwards
    q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i)
    return q == C_NULL ? 0 : Int(q-p+1)
end
# Reverse-search a byte array for character `b`: an ASCII character is searched
# as a single byte, anything else as its encoded byte sequence.
function rsearch(a::ByteArray, b::Char, i::Integer = length(a))
    isascii(b) && return rsearch(a, UInt8(b), i)
    return rsearch(a, Vector{UInt8}(string(b)), i).start
end
## optimized concatenation, reverse, repeat ##
# Concatenate Strings: a single argument is returned as-is; otherwise allocate
# the total byte length once and copy each piece into place.
function string(a::String...)
    length(a) == 1 && return a[1]::String
    total = 0
    for piece in a
        total += piece.len
    end
    out = _string_n(total)
    pos = 1
    for piece in a
        unsafe_copy!(pointer(out, pos), pointer(piece), piece.len)
        pos += piece.len
    end
    return out
end
# UTF-8 encoding length of a character
# Number of bytes used to encode `d` in UTF-8. Code points at or above 0x110000
# count as 3 bytes, the length of the replacement character '\ufffd'.
function codelen(d::Char)
    c = UInt32(d)
    c < 0x80 && return 1
    c < 0x800 && return 2
    c < 0x10000 && return 3
    c < 0x110000 && return 4
    return 3 # '\ufffd'
end
# Concatenate a mix of Strings and Chars into one String.
# First pass: sum the output size in bytes (`codelen` for a Char, `len` for a
# String). Second pass: write each Char's encoding byte by byte and copy each
# String's bytes.
function string(a::Union{String,Char}...)
n = 0
for d in a
if isa(d,Char)
n += codelen(d::Char)
else
n += (d::String).len
end
end
out = _string_n(n)
# `offs` is the 1-based position of the next byte to write in `out`
offs = 1
p = pointer(out)
for d in a
if isa(d,Char)
c = UInt32(d::Char)
if c < 0x80
# 1-byte form
unsafe_store!(p, c%UInt8, offs); offs += 1
elseif c < 0x800
# 2-byte form
unsafe_store!(p, (( c >> 6 ) | 0xC0)%UInt8, offs); offs += 1
unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1
elseif c < 0x10000
# 3-byte form
unsafe_store!(p, (( c >> 12 ) | 0xE0)%UInt8, offs); offs += 1
unsafe_store!(p, (((c >> 6) & 0x3F ) | 0x80)%UInt8, offs); offs += 1
unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1
elseif c < 0x110000
# 4-byte form
unsafe_store!(p, (( c >> 18 ) | 0xF0)%UInt8, offs); offs += 1
unsafe_store!(p, (((c >> 12) & 0x3F ) | 0x80)%UInt8, offs); offs += 1
unsafe_store!(p, (((c >> 6) & 0x3F ) | 0x80)%UInt8, offs); offs += 1
unsafe_store!(p, (( c & 0x3F ) | 0x80)%UInt8, offs); offs += 1
else
# '\ufffd'
unsafe_store!(p, 0xef, offs); offs += 1
unsafe_store!(p, 0xbf, offs); offs += 1
unsafe_store!(p, 0xbd, offs); offs += 1
end
else
l = (d::String).len
unsafe_copy!(pointer(out,offs), pointer(d::String), l)
offs += l
end
end
return out
end
# Reverse the characters of `s` into a new String (strings of at most one byte
# are returned unchanged). Bytes are read front to back and each character's
# bytes are written, in their original order, at the back of `buf`.
# Throws `UnicodeError(UTF_ERR_SHORT, ...)` when a multi-byte lead byte does not
# leave room for its full sequence in the output.
function reverse(s::String)
dat = convert(Vector{UInt8},s)
n = length(dat)
n <= 1 && return s
buf = StringVector(n)
# `out` counts the output slots still unfilled; `pos` is the read position
out = n
pos = 1
@inbounds while out > 0
ch = dat[pos]
if ch > 0xdf
if ch < 0xf0
# lead byte 0xe0:0xef, handled as a 3-byte sequence
(out -= 3) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
buf[out + 1], buf[out + 2], buf[out + 3] = ch, dat[pos + 1], dat[pos + 2]
pos += 3
else
# lead byte >= 0xf0, handled as a 4-byte sequence
(out -= 4) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
buf[out+1], buf[out+2], buf[out+3], buf[out+4] = ch, dat[pos+1], dat[pos+2], dat[pos+3]
pos += 4
end
elseif ch > 0x7f
# byte in 0x80:0xdf, handled as a 2-byte sequence
(out -= 2) < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
buf[out + 1], buf[out + 2] = ch, dat[pos + 1]
pos += 2
else
# single-byte character
buf[out] = ch
out -= 1
pos += 1
end
end
String(buf)
end
# Return `s` repeated `r` times as a new String.
# Throws `ArgumentError` for negative `r`.
function repeat(s::String, r::Integer)
r < 0 && throw(ArgumentError("can't repeat a string $r times"))
n = s.len
out = _string_n(n*r)
if n == 1 # common case: repeating a single ASCII char
# fill all `r` output bytes with the single source byte
ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), out, unsafe_load(pointer(s)), r)
else
for i=1:r
# copy number i starts at byte offset (i-1)*n
unsafe_copy!(pointer(out, 1+(i-1)*n), pointer(s), n)
end
end
return out
end

View File

@@ -0,0 +1,10 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license
include("strings/errors.jl")
include("strings/types.jl")
include("strings/basic.jl")
include("strings/search.jl")
include("strings/util.jl")
include("strings/io.jl")
include("strings/utf8proc.jl")
importall .UTF8proc

View File

@@ -0,0 +1,157 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license
# SubString and RevString types
## substrings reference original strings ##
# A view into a parent string. `string` is the parent, `offset` is added to a
# substring index to obtain the parent index, and `endof` is the last index of
# the view (0 when the view is empty).
struct SubString{T<:AbstractString} <: AbstractString
string::T
offset::Int
endof::Int
# Build the view covering parent indices `i` through `j`.
# Throws `ArgumentError` when `i` is within range but not a valid index.
function SubString{T}(s::T, i::Int, j::Int) where T<:AbstractString
if i > endof(s) || j<i
# start past the end, or reversed bounds: empty view
return new(s, i-1, 0)
else
if !isvalid(s,i)
throw(ArgumentError("invalid SubString index"))
end
# move `j` back to the nearest valid index (not below `i`)
while !isvalid(s,j) && j > i
j -= 1
end
o = i-1
new(s, o, max(0, j-o))
end
end
end
# Outer constructors: infer the type parameter, flatten a substring of a
# substring onto the original parent, and accept any Integer indices.
SubString(s::T, i::Int, j::Int) where {T<:AbstractString} = SubString{T}(s, i, j)
SubString(s::SubString, i::Int, j::Int) = SubString(s.string, s.offset+i, s.offset+j)
SubString(s::AbstractString, i::Integer, j::Integer) = SubString(s, Int(i), Int(j))
# With no end index the view extends to `endof(s)`.
SubString(s::AbstractString, i::Integer) = SubString(s, i, endof(s))
# Byte size: 0 for an empty view, otherwise one less than the index following
# the last character.
sizeof(s::SubString{String}) = s.endof == 0 ? 0 : nextind(s, s.endof) - 1
# TODO: length(s::SubString) = ??
# default implementation will work but it's slow
# can this be delegated efficiently somehow?
# that may require additional string interfaces
length(s::SubString{<:DirectIndexString}) = endof(s)
# Character count for a view of a String, computed by the C routine
# `u8_charnum` over the view's bytes.
function length(s::SubString{String})
return s.endof==0 ? 0 : Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t),
pointer(s), nextind(s, s.endof) - 1))
end
# Iterate a SubString: decode the character at view index `i` from the parent
# and translate the following index back into view coordinates.
# Throws `BoundsError` when `i` is outside `1:s.endof`.
function next(s::SubString, i::Int)
    (1 <= i <= s.endof) || throw(BoundsError(s, i))
    ch, parent_next = next(s.string, i + s.offset)
    return ch, parent_next - s.offset
end
# Character at view index `i`, fetched from the parent at `i + offset`.
# Throws `BoundsError` when `i` is outside `1:s.endof`.
function getindex(s::SubString, i::Int)
    (1 <= i <= s.endof) || throw(BoundsError(s, i))
    return getindex(s.string, i + s.offset)
end
# Last index of the view is stored in the struct.
endof(s::SubString) = s.endof
# A view index is valid when it is within the view and valid in the parent.
function isvalid(s::SubString, i::Integer)
return (start(s) <= i <= endof(s)) && isvalid(s.string, s.offset+i)
end
# For direct-index parents only the range check is performed.
isvalid(s::SubString{<:DirectIndexString}, i::Integer) = (start(s) <= i <= endof(s))
# For direct-index parents character number and index coincide after a bounds check.
ind2chr(s::SubString{<:DirectIndexString}, i::Integer) = begin checkbounds(s,i); i end
chr2ind(s::SubString{<:DirectIndexString}, i::Integer) = begin checkbounds(s,i); i end
# Index stepping is delegated to the parent and shifted by the offset.
nextind(s::SubString, i::Integer) = nextind(s.string, i+s.offset)-s.offset
prevind(s::SubString, i::Integer) = prevind(s.string, i+s.offset)-s.offset
# Converting a string to a SubString of its own type wraps the whole string.
convert(::Type{SubString{T}}, s::T) where {T<:AbstractString} = SubString(s, 1, endof(s))
# Copy the view's bytes out of the parent into a new String.
String(p::SubString{String}) =
unsafe_string(pointer(p.string, p.offset+1), nextind(p, p.endof)-1)
# Range indexing on a generic string returns a SubString view of `s`.
# Throws `BoundsError(s, r)` when `checkbounds` rejects the range.
function getindex(s::AbstractString, r::UnitRange{Int})
    if !checkbounds(s, r)
        throw(BoundsError(s, r))
    end
    return SubString(s, first(r), last(r))
end
# Three-way bytewise comparison of two String views: memcmp over the common
# prefix, then the shorter view sorts first. Returns -1, 0 or +1.
function cmp(a::SubString{String}, b::SubString{String})
    na, nb = sizeof(a), sizeof(b)
    order = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt),
                  pointer(a), pointer(b), min(na,nb))
    order < 0 && return -1
    order > 0 && return +1
    return cmp(na, nb)
end
# don't make unnecessary copies when passing substrings to C functions
# `cconvert` returns the substring itself; `unsafe_convert` then yields the
# parent's data pointer advanced by the view's offset.
cconvert(::Type{Ptr{UInt8}}, s::SubString{String}) = s
cconvert(::Type{Ptr{Int8}}, s::SubString{String}) = s
function unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where R<:Union{Int8, UInt8}
convert(Ptr{R}, pointer(s.string)) + s.offset
end
## reversed strings without data movement ##
# Wrapper presenting `string` in reverse order; the data itself is not copied.
struct RevString{T<:AbstractString} <: AbstractString
string::T
end
# Size queries are forwarded to the wrapped string.
endof(s::RevString) = endof(s.string)
length(s::RevString) = length(s.string)
sizeof(s::RevString) = sizeof(s.string)
# Index `i` of the reversed string corresponds to index `n-i+1` of the wrapped
# string; the following state is derived from the wrapped string's previous index.
function next(s::RevString, i::Int)
n = endof(s); j = n-i+1
(s.string[j], n-prevind(s.string,j)+1)
end
"""
reverse(s::AbstractString) -> AbstractString
Reverses a string.
```jldoctest
julia> reverse("JuliaLang")
"gnaLailuJ"
```
"""
# Generic reverse wraps the string lazily; reversing a RevString unwraps it.
reverse(s::AbstractString) = RevString(s)
reverse(s::RevString) = s.string
## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)]
# Generic fallback goes through character numbers.
reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i))
# Direct-index strings mirror the index arithmetically.
reverseind(s::Union{DirectIndexString,SubString{DirectIndexString}}, i::Integer) = length(s) + 1 - i
reverseind(s::RevString, i::Integer) = endof(s) - i + 1
# For a view of a String, translate to a parent index, delegate to the parent's
# `reverseind`, then shift back by the offset.
reverseind(s::SubString{String}, i::Integer) =
reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset
# Repeat a generic string `r` times. Zero repeats give "", one repeat returns
# `s` itself, otherwise `s` is converted to String and repeated.
# Throws `ArgumentError` for negative `r`.
function repeat(s::AbstractString, r::Integer)
    r < 0 && throw(ArgumentError("can't repeat a string $r times"))
    r == 0 && return ""
    r == 1 && return s
    return repeat(convert(String, s), r)
end
"""
^(s::AbstractString, n::Integer)
Repeat `n` times the string `s`.
The [`repeat`](@ref) function is an alias to this operator.
```jldoctest
julia> "Test "^3
"Test Test Test "
```
"""
(^)(s::AbstractString, r::Integer) = repeat(s,r)
# Pointer to the first byte of the view: the parent's pointer advanced by the offset.
pointer(x::SubString{String}) = pointer(x.string) + x.offset
# Pointer to byte `i` (1-based) of the view; no bounds check is performed here.
pointer(x::SubString{String}, i::Integer) = pointer(x.string) + x.offset + (i-1)

View File

@@ -0,0 +1,398 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license
# Various Unicode functionality from the utf8proc library
module UTF8proc
import Base: show, ==, hash, string, Symbol, isless, length, eltype, start, next, done, convert, isvalid, lowercase, uppercase, titlecase
export isgraphemebreak, category_code, category_abbrev, category_string
# also exported by Base:
export normalize_string, graphemes, is_assigned_char, charwidth, isvalid,
islower, isupper, isalpha, isdigit, isnumber, isalnum,
iscntrl, ispunct, isspace, isprint, isgraph
# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
# `ch - 0xd800 < 0x800` tests for the surrogate range 0xd800:0xdfff with one
# comparison: for smaller unsigned `ch` the subtraction wraps to a large value.
isvalid(::Type{Char}, ch::Unsigned) = !((ch - 0xd800 < 0x800) | (ch > 0x10ffff))
isvalid(::Type{Char}, ch::Integer) = isvalid(Char, Unsigned(ch))
isvalid(::Type{Char}, ch::Char) = isvalid(Char, UInt32(ch))
isvalid(ch::Char) = isvalid(Char, ch)
# utf8 category constants
const UTF8PROC_CATEGORY_CN = 0
const UTF8PROC_CATEGORY_LU = 1
const UTF8PROC_CATEGORY_LL = 2
const UTF8PROC_CATEGORY_LT = 3
const UTF8PROC_CATEGORY_LM = 4
const UTF8PROC_CATEGORY_LO = 5
const UTF8PROC_CATEGORY_MN = 6
const UTF8PROC_CATEGORY_MC = 7
const UTF8PROC_CATEGORY_ME = 8
const UTF8PROC_CATEGORY_ND = 9
const UTF8PROC_CATEGORY_NL = 10
const UTF8PROC_CATEGORY_NO = 11
const UTF8PROC_CATEGORY_PC = 12
const UTF8PROC_CATEGORY_PD = 13
const UTF8PROC_CATEGORY_PS = 14
const UTF8PROC_CATEGORY_PE = 15
const UTF8PROC_CATEGORY_PI = 16
const UTF8PROC_CATEGORY_PF = 17
const UTF8PROC_CATEGORY_PO = 18
const UTF8PROC_CATEGORY_SM = 19
const UTF8PROC_CATEGORY_SC = 20
const UTF8PROC_CATEGORY_SK = 21
const UTF8PROC_CATEGORY_SO = 22
const UTF8PROC_CATEGORY_ZS = 23
const UTF8PROC_CATEGORY_ZL = 24
const UTF8PROC_CATEGORY_ZP = 25
const UTF8PROC_CATEGORY_CC = 26
const UTF8PROC_CATEGORY_CF = 27
const UTF8PROC_CATEGORY_CS = 28
const UTF8PROC_CATEGORY_CO = 29
# strings corresponding to the category constants
const category_strings = [
"Other, not assigned",
"Letter, uppercase",
"Letter, lowercase",
"Letter, titlecase",
"Letter, modifier",
"Letter, other",
"Mark, nonspacing",
"Mark, spacing combining",
"Mark, enclosing",
"Number, decimal digit",
"Number, letter",
"Number, other",
"Punctuation, connector",
"Punctuation, dash",
"Punctuation, open",
"Punctuation, close",
"Punctuation, initial quote",
"Punctuation, final quote",
"Punctuation, other",
"Symbol, math",
"Symbol, currency",
"Symbol, modifier",
"Symbol, other",
"Separator, space",
"Separator, line",
"Separator, paragraph",
"Other, control",
"Other, format",
"Other, surrogate",
"Other, private use"
]
# Option bit flags; they are OR-ed together and passed as the `options`
# argument of `utf8proc_map`.
const UTF8PROC_STABLE = (1<<1)
const UTF8PROC_COMPAT = (1<<2)
const UTF8PROC_COMPOSE = (1<<3)
const UTF8PROC_DECOMPOSE = (1<<4)
const UTF8PROC_IGNORE = (1<<5)
const UTF8PROC_REJECTNA = (1<<6)
const UTF8PROC_NLF2LS = (1<<7)
const UTF8PROC_NLF2PS = (1<<8)
# NLF2LF is the combination of the LS and PS bits
const UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
const UTF8PROC_STRIPCC = (1<<9)
const UTF8PROC_CASEFOLD = (1<<10)
const UTF8PROC_CHARBOUND = (1<<11)
const UTF8PROC_LUMP = (1<<12)
const UTF8PROC_STRIPMARK = (1<<13)
############################################################################
# Raise an error carrying the message utf8proc reports for `result`.
utf8proc_error(result) = error(unsafe_string(ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result)))
# Apply the utf8proc transformation selected by `options` to `str` and return
# the result as a new String. A negative return from any utf8proc call is
# turned into an error.
function utf8proc_map(str::String, options::Integer)
# first call with a NULL buffer only reports how many words are needed
nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
str, sizeof(str), C_NULL, 0, options)
nwords < 0 && utf8proc_error(nwords)
# 4 bytes of buffer per word
buffer = Base.StringVector(nwords*4)
nwords = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
str, sizeof(str), buffer, nwords, options)
nwords < 0 && utf8proc_error(nwords)
# re-encode the buffer in place; the result is the number of bytes to keep
nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, nwords, options)
nbytes < 0 && utf8proc_error(nbytes)
return String(resize!(buffer, nbytes))
end
# Other string types are converted to String first.
utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
# Keyword form: translate each boolean keyword into its UTF8PROC_* flag and run
# `utf8proc_map`. Throws `ArgumentError` when `compat` or `stripmark` is set
# while both `compose` and `decompose` are false, or when more than one newline
# conversion is requested.
function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false, compose::Bool=true, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
flags = 0
stable && (flags = flags | UTF8PROC_STABLE)
compat && (flags = flags | UTF8PROC_COMPAT)
# `decompose` takes precedence over `compose`
if decompose
flags = flags | UTF8PROC_DECOMPOSE
elseif compose
flags = flags | UTF8PROC_COMPOSE
elseif compat || stripmark
throw(ArgumentError("compat=true or stripmark=true require compose=true or decompose=true"))
end
stripignore && (flags = flags | UTF8PROC_IGNORE)
rejectna && (flags = flags | UTF8PROC_REJECTNA)
# the Bools are summed to count how many newline options were requested
newline2ls + newline2ps + newline2lf > 1 && throw(ArgumentError("only one newline conversion may be specified"))
newline2ls && (flags = flags | UTF8PROC_NLF2LS)
newline2ps && (flags = flags | UTF8PROC_NLF2PS)
newline2lf && (flags = flags | UTF8PROC_NLF2LF)
stripcc && (flags = flags | UTF8PROC_STRIPCC)
casefold && (flags = flags | UTF8PROC_CASEFOLD)
lump && (flags = flags | UTF8PROC_LUMP)
stripmark && (flags = flags | UTF8PROC_STRIPMARK)
utf8proc_map(s, flags)
end
"""
normalize_string(s::AbstractString, normalform::Symbol)
Normalize the string `s` according to one of the four "normal forms" of the Unicode
standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C
(canonical composition) and D (canonical decomposition) convert different visually identical
representations of the same abstract string into a single canonical form, with form C being
more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents":
they convert characters that are abstractly similar but visually distinct into a single
canonical choice (e.g. they expand ligatures into the individual characters), with form KC
being more compact.
Alternatively, finer control and additional transformations may be obtained by calling
`normalize_string(s; keywords...)`, where any number of the following boolean keywords
options (which all default to `false` except for `compose`) are specified:
* `compose=false`: do not perform canonical composition
* `decompose=true`: do canonical decomposition instead of canonical composition
(`compose=true` is ignored if present)
* `compat=true`: compatibility equivalents are canonicalized
* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison
* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline
sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or
paragraph-separation (PS) character, respectively
* `stripmark=true`: strip diacritical marks (e.g. accents)
* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen
or the left-to-right marker)
* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to
spaces; newlines are also converted to spaces unless a newline-conversion flag was
specified
* `rejectna=true`: throw an error if unassigned code points are found
* `stable=true`: enforce Unicode Versioning Stability
For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
"""
# Normal-form variant: map `nf` (:NFC, :NFD, :NFKC or :NFKD) to its flag set
# and run `utf8proc_map`. Throws `ArgumentError` for any other symbol.
function normalize_string(s::AbstractString, nf::Symbol)
    flags = if nf == :NFC
        UTF8PROC_STABLE | UTF8PROC_COMPOSE
    elseif nf == :NFD
        UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
    elseif nf == :NFKC
        UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
    elseif nf == :NFKD
        UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
    else
        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
    end
    return utf8proc_map(s, flags)
end
############################################################################
"""
charwidth(c)
Gives the number of columns needed to print a character.
"""
charwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
# Case conversion of a single character. ASCII letters are shifted by 0x20
# directly (other ASCII characters are returned unchanged); non-ASCII
# characters are passed to the matching utf8proc routine.
lowercase(c::Char) = isascii(c) ? ('A' <= c <= 'Z' ? c + 0x20 : c) : Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
uppercase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
titlecase(c::Char) = isascii(c) ? ('a' <= c <= 'z' ? c - 0x20 : c) : Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
############################################################################
# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
category_code(c) = ccall(:utf8proc_category, Cint, (UInt32,), c)
# more human-readable representations of the category code
category_abbrev(c) = unsafe_string(ccall(:utf8proc_category_string, Cstring, (UInt32,), c))
category_string(c) = category_strings[category_code(c)+1]
"""
is_assigned_char(c) -> Bool
Returns `true` if the given char or integer is an assigned Unicode code point.
"""
is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN
## libc character class predicates ##
"""
islower(c::Char) -> Bool
Tests whether a character is a lowercase letter.
A character is classified as lowercase if it belongs to Unicode category Ll,
Letter: Lowercase.
"""
islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
# true for Unicode upper and mixed case
"""
isupper(c::Char) -> Bool
Tests whether a character is an uppercase letter.
A character is classified as uppercase if it belongs to Unicode category Lu,
Letter: Uppercase, or Lt, Letter: Titlecase.
"""
function isupper(c::Char)
    return category_code(c) in (UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LT)
end
"""
isdigit(c::Char) -> Bool
Tests whether a character is a numeric digit (0-9).
"""
isdigit(c::Char) = ('0' <= c <= '9')
"""
isalpha(c::Char) -> Bool
Tests whether a character is alphabetic.
A character is classified as alphabetic if it belongs to the Unicode general
category Letter, i.e. a character whose category code begins with 'L'.
"""
isalpha(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO)
"""
isnumber(c::Char) -> Bool
Tests whether a character is numeric.
A character is classified as numeric if it belongs to the Unicode general category Number,
i.e. a character whose category code begins with 'N'.
"""
isnumber(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO)
"""
isalnum(c::Char) -> Bool
Tests whether a character is alphanumeric.
A character is classified as alphanumeric if it belongs to the Unicode general
category Letter or Number, i.e. a character whose category code begins with 'L' or 'N'.
"""
function isalnum(c::Char)
ccode = category_code(c)
# Letter categories (LU through LO) or Number categories (ND through NO)
return (UTF8PROC_CATEGORY_LU <= ccode <= UTF8PROC_CATEGORY_LO) ||
(UTF8PROC_CATEGORY_ND <= ccode <= UTF8PROC_CATEGORY_NO)
end
# following C++ only control characters from the Latin-1 subset return true
"""
iscntrl(c::Char) -> Bool
Tests whether a character is a control character.
Control characters are the non-printing characters of the Latin-1 subset of Unicode.
"""
iscntrl(c::Char) = (c <= Char(0x1f) || Char(0x7f) <= c <= Char(0x9f))
"""
ispunct(c::Char) -> Bool
Tests whether a character belongs to the Unicode general category Punctuation, i.e. a
character whose category code begins with 'P'.
"""
ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO)
# \u85 is the Unicode Next Line (NEL) character
"""
isspace(c::Char) -> Bool
Tests whether a character is any whitespace character. Includes ASCII characters '\\t',
'\\n', '\\v', '\\f', '\\r', and ' ', Latin-1 character U+0085, and characters in Unicode
category Zs.
"""
@inline isspace(c::Char) = c == ' ' || '\t' <= c <='\r' || c == '\u85' || '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
"""
isprint(c::Char) -> Bool
Tests whether a character is printable, including spaces, but not a control character.
"""
isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS)
# true in principle if a printer would use ink
"""
isgraph(c::Char) -> Bool
Tests whether a character is printable, and not a space.
Any character that would cause a printer to use ink should be
classified with `isgraph(c)==true`.
"""
isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO)
############################################################################
# iterators for grapheme segmentation
isgraphemebreak(c1::Char, c2::Char) =
ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
# Stateful grapheme break required by Unicode-9 rules: the string
# must be processed in sequence, with state initialized to Ref{Int32}(0).
# Requires utf8proc v2.0 or later.
isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char) =
ccall(:utf8proc_grapheme_break_stateful, Bool, (UInt32, UInt32, Ref{Int32}), c1, c2, state)
struct GraphemeIterator{S<:AbstractString}
s::S # original string (for generation of SubStrings)
end
"""
graphemes(s::AbstractString) -> GraphemeIterator
Returns an iterator over substrings of `s` that correspond to the extended graphemes in the
string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as
single characters, even though they may contain more than one codepoint; for example a
letter combined with an accent mark is a single grapheme.)
"""
graphemes(s::AbstractString) = GraphemeIterator{typeof(s)}(s)
eltype(::Type{GraphemeIterator{S}}) where {S} = SubString{S}
eltype(::Type{GraphemeIterator{SubString{S}}}) where {S} = SubString{S}
# Number of graphemes in the wrapped string: count the grapheme breaks found
# while walking the characters in order with the stateful break test.
function length(g::GraphemeIterator)
    prev = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
    count = 0
    state = Ref{Int32}(0)
    for c in g.s
        if isgraphemebreak!(state, prev, c)
            count += 1
        end
        prev = c
    end
    return count
end
start(g::GraphemeIterator) = (start(g.s), Ref{Int32}(0))
done(g::GraphemeIterator, i) = done(g.s, i[1])
# Produce the next grapheme of `g`.
# `i_` is the iteration state `(index, break_state)` created by `start`.
# Returns `(SubString covering the grapheme, (index of next grapheme, break_state))`.
function next(g::GraphemeIterator, i_)
    s = g.s
    i, state = i_
    j = i
    c0, k = next(s, i)
    while !done(s, k) # loop until next grapheme is s[i:j]
        # `knext` is the index following the character at `k`; it must be kept
        # so `k` advances as an index (the original left `k =` dangling, which
        # assigned the character `c` to `k`)
        c, knext = next(s, k)
        isgraphemebreak!(state, c0, c) && break
        j = k
        k = knext
        c0 = c
    end
    return (SubString(s, i, j), (k, state))
end
# Equality, hashing, ordering and conversion all defer to the wrapped string.
==(g1::GraphemeIterator, g2::GraphemeIterator) = g1.s == g2.s
hash(g::GraphemeIterator, h::UInt) = hash(g.s, h)
isless(g1::GraphemeIterator, g2::GraphemeIterator) = isless(g1.s, g2.s)
convert(::Type{S}, g::GraphemeIterator) where {S<:AbstractString} = convert(S, g.s)
# Display includes the grapheme count, the string type and the string itself.
show(io::IO, g::GraphemeIterator{S}) where {S} = print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
############################################################################
end # module

View File

@@ -0,0 +1,500 @@
# This file is a part of Julia. License is MIT: https://julialang.org/license
# starts with and ends with predicates
"""
startswith(s::AbstractString, prefix::AbstractString)
Returns `true` if `s` starts with `prefix`. If `prefix` is a vector or set
of characters, tests whether the first character of `s` belongs to that set.
See also [`endswith`](@ref).
```jldoctest
julia> startswith("JuliaLang", "Julia")
true
```
"""
# Generic prefix test: walk `a` and `b` in lockstep, each with its own
# iteration state, and fail at the first differing character.
# Returns `true` only if all of `b` was consumed.
function startswith(a::AbstractString, b::AbstractString)
    i = start(a)
    j = start(b)
    # `b` must be tested with its own state `j`: two string types can assign
    # different indices to the same characters
    while !done(a,i) && !done(b,j)
        c, i = next(a,i)
        d, j = next(b,j)
        (c != d) && (return false)
    end
    return done(b,j)
end
startswith(str::AbstractString, chars::Chars) = !isempty(str) && first(str) in chars
"""
endswith(s::AbstractString, suffix::AbstractString)
Returns `true` if `s` ends with `suffix`. If `suffix` is a vector or set of
characters, tests whether the last character of `s` belongs to that set.
See also [`startswith`](@ref).
```jldoctest
julia> endswith("Sunday", "day")
true
```
"""
# Generic suffix test: walk both strings backwards from their last characters
# and fail at the first mismatch. Returns `true` only if all of `b` was consumed.
function endswith(a::AbstractString, b::AbstractString)
    ia, ib = endof(a), endof(b)
    firsta, firstb = start(a), start(b)
    while ia >= firsta && ib >= firstb
        a[ia] == b[ib] || return false
        ia = prevind(a, ia)
        ib = prevind(b, ib)
    end
    return ib < firstb
end
# With a character collection, test the last character for membership.
endswith(str::AbstractString, chars::Chars) = !isempty(str) && last(str) in chars
# Fast paths: compare the first `b.len` (or `length(b)`) bytes with memcmp once
# `a` is known to be at least as long as `b`.
startswith(a::String, b::String) =
(a.len >= b.len && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, b.len) == 0)
startswith(a::Vector{UInt8}, b::Vector{UInt8}) =
(length(a) >= length(b) && ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0)
# TODO: fast endswith
"""
chop(s::AbstractString)
Remove the last character from `s`.
```jldoctest
julia> a = "March"
"March"
julia> chop(a)
"Marc"
```
"""
chop(s::AbstractString) = SubString(s, 1, endof(s)-1)
"""
chomp(s::AbstractString)
Remove a single trailing newline from a string.
```jldoctest
julia> chomp("Hello\\n")
"Hello"
```
"""
# Strip one trailing "\n" or "\r\n" from `s`, returning a SubString view.
# A string without a trailing newline is returned whole (as a SubString).
function chomp(s::AbstractString)
    last_i = endof(s)
    if last_i < 1 || s[last_i] != '\n'
        return SubString(s, 1, last_i)
    end
    before = prevind(s, last_i)
    if before < 1 || s[before] != '\r'
        return SubString(s, 1, last_i-1)
    end
    return SubString(s, 1, before-1)
end
# String specialization of `chomp`: inspect the trailing bytes directly
# (0x0a is '\n', 0x0d is '\r') and return a SubString view.
function chomp(s::String)
    i = endof(s)
    (i < 1 || codeunit(s,i) != 0x0a) && return SubString(s, 1, i)
    (i < 2 || codeunit(s,i-1) != 0x0d) && return SubString(s, 1, i-1)
    return SubString(s, 1, i-2)
end
# NOTE: use with caution -- breaks the immutable string convention!
# TODO: this is hard to provide with the new representation
#function chomp!(s::String)
# if !isempty(s) && codeunit(s,s.len) == 0x0a
# n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 1 : 2
# ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n)
# end
# return s
#end
chomp!(s::AbstractString) = chomp(s) # copying fallback for other string types
const _default_delims = [' ','\t','\n','\v','\f','\r']
"""
lstrip(s::AbstractString[, chars::Chars])
Return `s` with any leading whitespace and delimiters removed.
The default delimiters to remove are `' '`, `\\t`, `\\n`, `\\v`,
`\\f`, and `\\r`.
If `chars` (a character, or vector or set of characters) is provided,
instead remove characters contained in it.
```jldoctest
julia> a = lpad("March", 20)
" March"
julia> lstrip(a)
"March"
```
"""
# Drop leading characters that are members of `chars`; the result starts at the
# first character not in `chars`, or is empty when every character is.
function lstrip(s::AbstractString, chars::Chars=_default_delims)
    pos = start(s)
    while !done(s, pos)
        ch, following = next(s, pos)
        ch in chars || return s[pos:end]
        pos = following
    end
    return s[end+1:end]
end
"""
rstrip(s::AbstractString[, chars::Chars])
Return `s` with any trailing whitespace and delimiters removed.
The default delimiters to remove are `' '`, `\\t`, `\\n`, `\\v`,
`\\f`, and `\\r`.
If `chars` (a character, or vector or set of characters) is provided,
instead remove characters contained in it.
```jldoctest
julia> a = rpad("March", 20)
"March "
julia> rstrip(a)
"March"
```
"""
# Drop trailing characters that are members of `chars` by iterating a reversed
# view of `s`; the result is empty when every character is in `chars`.
function rstrip(s::AbstractString, chars::Chars=_default_delims)
    rev = RevString(s)
    pos = start(rev)
    while !done(rev, pos)
        ch, following = next(rev, pos)
        ch in chars || return s[1:end-pos+1]
        pos = following
    end
    return s[1:0]
end
"""
strip(s::AbstractString, [chars::Chars])
Return `s` with any leading and trailing whitespace removed.
If `chars` (a character, or vector or set of characters) is provided,
instead remove characters contained in it.
```jldoctest
julia> strip("{3, 5}\\n", ['{', '}', '\\n'])
"3, 5"
```
"""
strip(s::AbstractString) = lstrip(rstrip(s))
strip(s::AbstractString, chars::Chars) = lstrip(rstrip(s, chars), chars)
## string padding functions ##
# Left-pad `s` with copies of `p` until it is `n` columns wide (by `strwidth`).
# `s` is returned unchanged when it is already wide enough. A final partial copy
# of `p` is used when the missing width is not a multiple of `p`'s width.
function lpad(s::AbstractString, n::Integer, p::AbstractString=" ")
    missing_cols = n - strwidth(s)
    missing_cols <= 0 && return s
    padwidth = strwidth(p)
    padwidth == 1 && return string(p^missing_cols, s)
    whole = div(missing_cols, padwidth)
    partial = missing_cols - whole*padwidth
    # -1 makes `p[1:cut]` an empty range when no partial copy is needed
    cut = partial != 0 ? chr2ind(p, partial) : -1
    return string(p^whole, p[1:cut], s)
end
# Right-pad `s` with copies of `p` until it is `n` columns wide (by `strwidth`).
# `s` is returned unchanged when it is already wide enough. A final partial copy
# of `p` is used when the missing width is not a multiple of `p`'s width.
function rpad(s::AbstractString, n::Integer, p::AbstractString=" ")
    missing_cols = n - strwidth(s)
    missing_cols <= 0 && return s
    padwidth = strwidth(p)
    padwidth == 1 && return string(s, p^missing_cols)
    whole = div(missing_cols, padwidth)
    partial = missing_cols - whole*padwidth
    # -1 makes `p[1:cut]` an empty range when no partial copy is needed
    cut = partial != 0 ? chr2ind(p, partial) : -1
    return string(s, p^whole, p[1:cut])
end
"""
lpad(s, n::Integer, p::AbstractString=" ")
Make a string at least `n` columns wide when printed by padding `s` on the left
with copies of `p`.
```jldoctest
julia> lpad("March",10)
" March"
```
"""
lpad(s, n::Integer, p=" ") = lpad(string(s),n,string(p))
"""
rpad(s, n::Integer, p::AbstractString=" ")
Make a string at least `n` columns wide when printed by padding `s` on the right
with copies of `p`.
```jldoctest
julia> rpad("March",20)
"March "
```
"""
rpad(s, n::Integer, p=" ") = rpad(string(s),n,string(p))
# Center `s` in `n` columns: left-pad to the midpoint width, then right-pad to `n`.
cpad(s, n::Integer, p=" ") = rpad(lpad(s,div(n+strwidth(s),2),p),n,p)
# splitter can be a Char, Vector{Char}, AbstractString, Regex, ...
# any splitter that provides search(s::AbstractString, splitter)
split(str::T, splitter; limit::Integer=0, keep::Bool=true) where {T<:SubString} =
_split(str, splitter, limit, keep, T[])
"""
split(s::AbstractString, [chars]; limit::Integer=0, keep::Bool=true)
Return an array of substrings by splitting the given string on occurrences of the given
character delimiters, which may be specified in any of the formats allowed by `search`'s
second argument (i.e. a single character, collection of characters, string, or regular
expression). If `chars` is omitted, it defaults to the set of all space characters, and
`keep` is taken to be `false`. The two keyword arguments are optional: they are a
maximum size for the result and a flag determining whether empty fields should be kept in
the result.
```jldoctest
julia> a = "Ma.rch"
"Ma.rch"
julia> split(a,".")
2-element Array{SubString{String},1}:
"Ma"
"rch"
```
"""
split(str::T, splitter; limit::Integer=0, keep::Bool=true) where {T<:AbstractString} =
_split(str, splitter, limit, keep, SubString{T}[])
# Worker behind `split`: scan `str` from the left for `splitter` and push the
# pieces found between matches onto `strs`, which is also the return value.
# Scanning stops once `strs` holds `limit-1` pieces; whatever remains of `str`
# is then pushed as the final piece. Empty pieces are pushed only when
# `keep_empty` is true.
function _split(str::AbstractString, splitter, limit::Integer, keep_empty::Bool, strs::Array)
    piece_start = start(str)
    last_idx = endof(str)
    found = search(str, splitter, piece_start)
    match_start = first(found)
    resume = nextind(str, last(found))
    while 0 < match_start <= last_idx && length(strs) != limit-1
        if piece_start < resume
            # the current piece runs up to the character just before the match
            (keep_empty || piece_start < match_start) &&
                push!(strs, SubString(str, piece_start, prevind(str, match_start)))
            piece_start = resume
        end
        # make sure the next search starts beyond the start of this match
        if resume <= match_start
            resume = nextind(str, match_start)
        end
        found = search(str, splitter, resume)
        match_start = first(found)
        resume = nextind(str, last(found))
    end
    # trailing piece: everything from `piece_start` to the end of `str`
    (keep_empty || !done(str, piece_start)) && push!(strs, SubString(str, piece_start))
    return strs
end
# With no delimiter given, split on `_default_delims` and drop empty fields.
# A bit oddball, but standard behavior in Perl, Ruby & Python.
function split(str::AbstractString)
    return split(str, _default_delims; limit=0, keep=false)
end
# `SubString` input: the pieces are collected in an array whose element type is
# the input's own type `T`.
function rsplit(str::T, splitter; limit::Integer=0, keep::Bool=true) where {T<:SubString}
    pieces = T[]
    return _rsplit(str, splitter, limit, keep, pieces)
end
"""
rsplit(s::AbstractString, [chars]; limit::Integer=0, keep::Bool=true)
Similar to [`split`](@ref), but starting from the end of the string.
```jldoctest
julia> a = "M.a.r.c.h"
"M.a.r.c.h"
julia> rsplit(a,".")
5-element Array{SubString{String},1}:
"M"
"a"
"r"
"c"
"h"
julia> rsplit(a,".";limit=1)
1-element Array{SubString{String},1}:
"M.a.r.c.h"
julia> rsplit(a,".";limit=2)
2-element Array{SubString{String},1}:
"M.a.r.c"
"h"
```
"""
rsplit(str::T, splitter; limit::Integer=0, keep::Bool=true) where {T<:AbstractString} =
_rsplit(str, splitter, limit, keep, SubString{T}[])
# Worker behind `rsplit`: scan `str` from the right for `splitter` and prepend the
# pieces found between matches to `strs`, which is also the return value.
# Scanning stops once `strs` holds `limit-1` pieces; whatever remains at the front
# of `str` is then prepended as the first piece. Empty pieces are added only when
# `keep_empty` is true.
function _rsplit(str::AbstractString, splitter, limit::Integer, keep_empty::Bool, strs::Array)
    i = start(str)
    n = endof(str)            # index where the piece currently being delimited ends
    r = rsearch(str, splitter)
    j = first(r) - 1          # index just before the match
    k = last(r)               # last index of the match
    while (0 <= j < n) && (length(strs) != limit-1)
        if i <= k
            # The piece to the right of the match starts at the character after
            # index `k`. Use `nextind` rather than `k+1`: when the character at `k`
            # is multi-byte, `k+1` is not a valid character index and `SubString`
            # rejects it.
            (keep_empty || (k < n)) &&
                unshift!(strs, SubString(str, nextind(str, k), n))
            n = j
        end
        # make sure the next search starts before this match
        (k <= j) && (j = prevind(str, j))
        r = rsearch(str, splitter, j)
        j = first(r) - 1
        k = last(r)
    end
    # leading piece: everything from the start of `str` up to `n`
    (keep_empty || (n > 0)) && unshift!(strs, SubString(str, 1, n))
    return strs
end
#rsplit(str::AbstractString) = rsplit(str, _default_delims, 0, false)
# Emit the replacement for one match: a plain `repl` value is printed as-is.
function _replace(io, repl, str, r, pattern)
    return print(io, repl)
end

# When `repl` is a function, it is called with the matched substring (the range
# `r` of `str`) and its result is printed.
function _replace(io, repl::Function, str, r, pattern)
    matched = SubString(str, first(r), last(r))
    return print(io, repl(matched))
end
# Build a copy of `str` in which matches of `pattern` (located with `search`) are
# replaced via `_replace(out, repl, str, r, pattern)`, and return it as a `String`.
# At most `limit` matches are replaced when `limit` is positive.
function replace(str::String, pattern, repl, limit::Integer)
# `n` numbers the match being handled; it starts at 1 and only grows, so a
# `limit` of 0 (or less) never equals it and places no bound on replacements
n = 1
e = endof(str)
# `i` is where the not-yet-copied part of `str` begins; `a` remembers the start
i = a = start(str)
r = search(str,pattern,i)
# `j`/`k` are the first and last index of the current match
j, k = first(r), last(r)
# output buffer over a `StringVector` sized at 1.2x the byte size of `str`
# NOTE(review): the two `true` flags are presumably readable/writable - confirm
out = IOBuffer(StringVector(floor(Int, 1.2sizeof(str))), true, true)
# reset the buffer's size and position fields so writing starts at the beginning
out.size = 0
out.ptr = 1
# the loop runs until the search reports a first index of 0
while j != 0
if i == a || i <= k
# copy the `j-i` bytes between the previous position and this match, then
# emit the replacement
unsafe_write(out, pointer(str, i), UInt(j-i))
_replace(out, repl, str, r, pattern)
end
if k<j
# match range is empty (last < first): keep `i` at `j` and search from the
# index after `j`
i = j
k = nextind(str, j)
else
# otherwise continue right after the last index of the match
i = k = nextind(str, k)
end
# stop when the match started beyond the last index of `str`
if j > e
break
end
r = search(str,pattern,k)
j, k = first(r), last(r)
# the limit is checked after the next search, so that match is left unreplaced
n == limit && break
n += 1
end
# copy whatever follows the last handled position
write(out, SubString(str,i))
String(take!(out))
end
"""
replace(string::AbstractString, pat, r[, n::Integer=0])
Search for the given pattern `pat`, and replace each occurrence with `r`. If `n` is
provided, replace at most `n` occurrences. As with search, the second argument may be a
single character, a vector or a set of characters, a string, or a regular expression. If `r`
is a function, each occurrence is replaced with `r(s)` where `s` is the matched substring.
If `pat` is a regular expression and `r` is a `SubstitutionString`, then capture group
references in `r` are replaced with the corresponding matched text.
"""
replace(s::AbstractString, pat, f, n::Integer) = replace(String(s), pat, f, n)
replace(s::AbstractString, pat, r) = replace(s, pat, r, 0)
# hex <-> bytes conversion
"""
hex2bytes(s::AbstractString)
Convert an arbitrarily long hexadecimal string to its binary representation. Returns an
`Array{UInt8,1}`, i.e. an array of bytes.
```jldoctest
julia> a = hex(12345)
"3039"
julia> hex2bytes(a)
2-element Array{UInt8,1}:
0x30
0x39
```
"""
function hex2bytes(s::AbstractString)
a = zeros(UInt8, div(endof(s), 2))
i, j = start(s), 0
while !done(s, i)
c, i = next(s, i)
n = '0' <= c <= '9' ? c - '0' :
'a' <= c <= 'f' ? c - 'a' + 10 :
'A' <= c <= 'F' ? c - 'A' + 10 :
throw(ArgumentError("not a hexadecimal string: $(repr(s))"))
done(s, i) &&
throw(ArgumentError("string length must be even: length($(repr(s))) == $(length(s))"))
c, i = next(s, i)
n = '0' <= c <= '9' ? n << 4 + c - '0' :
'a' <= c <= 'f' ? n << 4 + c - 'a' + 10 :
'A' <= c <= 'F' ? n << 4 + c - 'A' + 10 :
throw(ArgumentError("not a hexadecimal string: $(repr(s))"))
a[j += 1] = n
end
resize!(a, j)
return a
end
"""
bytes2hex(bin_arr::Array{UInt8, 1}) -> String
Convert an array of bytes to its hexadecimal representation.
All characters are in lower-case.
```jldoctest
julia> a = hex(12345)
"3039"
julia> b = hex2bytes(a)
2-element Array{UInt8,1}:
0x30
0x39
julia> bytes2hex(b)
"3039"
```
"""
function bytes2hex(a::AbstractArray{UInt8})
b = Vector{UInt8}(2*length(a))
i = 0
for x in a
b[i += 1] = hex_chars[1 + x >> 4]
b[i += 1] = hex_chars[1 + x & 0xf]
end
return String(b)
end
# check for pure ASCII-ness
# Return `s` unchanged when every byte is below 0x80; otherwise throw an
# `ArgumentError` naming the index of the first offending byte.
function ascii(s::String)
    bytes = Vector{UInt8}(s)
    for i in eachindex(bytes)
        if bytes[i] >= 0x80
            throw(ArgumentError("invalid ASCII at index $i in $(repr(s))"))
        end
    end
    return s
end
"""
ascii(s::AbstractString)
Convert a string to `String` type and check that it contains only ASCII data, otherwise
throwing an `ArgumentError` indicating the position of the first non-ASCII byte.
```jldoctest
julia> ascii("abcdeγfgh")
ERROR: ArgumentError: invalid ASCII at index 6 in "abcdeγfgh"
Stacktrace:
[1] ascii(::String) at ./strings/util.jl:479
julia> ascii("abcdefgh")
"abcdefgh"
```
"""
ascii(x::AbstractString) = ascii(convert(String, x))