Add: julia-0.6.2
Former-commit-id: ccc667cf67d569f3fb3df39aa57c2134755a7551
This commit is contained in:
500
julia-0.6.2/share/julia/base/strings/basic.jl
Normal file
500
julia-0.6.2/share/julia/base/strings/basic.jl
Normal file
@@ -0,0 +1,500 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
## core string functions ##
|
||||
|
||||
# Fallbacks for the two primitives a string type has to supply itself. Reaching
# either of them raises an error that names the offending type.
function endof(s::AbstractString)
    error("you must implement endof(", typeof(s), ")")
end

function next(s::AbstractString, i::Int)
    error("you must implement next(", typeof(s), ",Int)")
end

# Directly indexable strings: the character at `i` together with `i+1`.
function next(s::DirectIndexString, i::Int)
    return (s[i], i+1)
end

# Any other integer index is converted to `Int` and dispatched again.
function next(s::AbstractString, i::Integer)
    return next(s, Int(i))
end
|
||||
|
||||
# No arguments give the empty string; a single string argument is handed back
# unchanged.
function string()
    return ""
end

function string(s::AbstractString)
    return s
end

"""
    String(s::AbstractString)

Convert a string to a contiguous byte array representation encoded as UTF-8 bytes.
This representation is often appropriate for passing strings to C.

The conversion is performed by printing `s` via `print_to_string`.
"""
function String(s::AbstractString)
    return print_to_string(s)
end
|
||||
|
||||
# Conversions between strings and byte vectors, character vectors and symbols.

# Bytes of a string: convert to `String` first, then to its byte vector.
function convert(::Type{Vector{UInt8}}, s::AbstractString)
    return convert(Vector{UInt8}, String(s))
end

# `Array{UInt8}` without a dimension is handled like `Vector{UInt8}`.
function convert(::Type{Array{UInt8}}, s::AbstractString)
    return convert(Vector{UInt8}, s)
end

function convert(::Type{String}, s::AbstractString)
    return String(s)
end

# Character vector: collect what iterating `s` yields.
function convert(::Type{Vector{Char}}, s::AbstractString)
    return collect(s)
end

function convert(::Type{Symbol}, s::AbstractString)
    return Symbol(s)
end

# Symbol to String goes through the symbol's C string representation.
function convert(::Type{String}, s::Symbol)
    return unsafe_string(Cstring(s))
end
|
||||
|
||||
## generic supplied functions ##
|
||||
|
||||
# Generic iteration: indices start at 1 and iteration is finished once the
# index is greater than `endof(s)`.
function start(s::AbstractString)
    return 1
end

function done(s::AbstractString, i)
    return i > endof(s)
end

# Scalar indexing is `next` with the follow-up index discarded.
function getindex(s::AbstractString, i::Int)
    ch, following = next(s, i)
    return ch
end

function getindex(s::AbstractString, i::Integer)
    return s[Int(i)]
end

# `s[:]` is the string itself.
function getindex(s::AbstractString, i::Colon)
    return s
end

# Unit ranges over other integer types are re-expressed with `Int` endpoints.
function getindex(s::AbstractString, r::UnitRange{<:Integer})
    lo = Int(first(r))
    hi = Int(last(r))
    return s[lo:hi]
end

# TODO: handle other ranges with stride ±1 specially?
# Index vectors: write the character at every listed index, in order, into a
# buffer pre-sized to `length(v)` and return the result.
function getindex(s::AbstractString, v::AbstractVector{<:Integer})
    emit = io -> begin
        for i in v
            write(io, s[i])
        end
    end
    return sprint(length(v), emit)
end

function getindex(s::AbstractString, v::AbstractVector{Bool})
    throw(ArgumentError("logical indexing not supported for strings"))
end
|
||||
|
||||
# Symbols are built from the `String` form of any other string type.
function Symbol(s::AbstractString)
    return Symbol(String(s))
end

"""
    sizeof(s::AbstractString)

The number of bytes in string `s`.

This generic method raises an error; it is reached only for string types
that do not provide their own `sizeof`.

# Example

```jldoctest
julia> sizeof("❤")
3
```
"""
function sizeof(s::AbstractString)
    error("type $(typeof(s)) has no canonical binary representation")
end
|
||||
|
||||
# Iterating any string type yields `Char`s.
function eltype(::Type{<:AbstractString})
    return Char
end

"""
    *(s::AbstractString, t::AbstractString)

Concatenate strings. `s * t * ...` gives the same result as `string(s, t, ...)`.

# Example

```jldoctest
julia> "Hello " * "world"
"Hello world"
```
"""
function (*)(s1::AbstractString, ss::AbstractString...)
    return string(s1, ss...)
end

# The multiplicative identity of a string type is its empty string; both an
# instance and the type itself are accepted.
function one(::Union{T,Type{T}}) where {T<:AbstractString}
    return convert(T, "")
end
|
||||
|
||||
# For directly indexable strings the last index is the character count.
function length(s::DirectIndexString)
    return endof(s)
end

"""
    length(s::AbstractString)

The number of characters in string `s`.

The generic method walks the string with `next` and counts the steps.

# Example

```jldoctest
julia> length("jμΛIα")
5
```
"""
function length(s::AbstractString)
    idx = start(s)
    done(s, idx) && return 0
    count = 1
    while true
        # only the follow-up index is needed; the character is ignored
        idx = next(s, idx)[2]
        done(s, idx) && return count
        count += 1
    end
end
|
||||
|
||||
## string comparison functions ##
|
||||
|
||||
# Character-by-character comparison of two strings. Returns -1, 0 or +1; when
# one string is a prefix of the other, the shorter one compares smaller.
function cmp(a::AbstractString, b::AbstractString)
    a === b && return 0
    ia = start(a)
    ib = start(b)
    while !done(a, ia)
        # `b` ran out first
        done(b, ib) && return +1
        ca, ia = next(a, ia)
        cb, ib = next(b, ib)
        ca == cb || return (ca < cb ? -1 : +1)
    end
    return done(b, ib) ? 0 : -1
end

function ==(a::AbstractString, b::AbstractString)
    return cmp(a, b) == 0
end

function isless(a::AbstractString, b::AbstractString)
    return cmp(a, b) < 0
end

# faster comparisons for symbols

# Compares the symbols' C strings with `strcmp` and reduces the result to its sign.
function cmp(a::Symbol, b::Symbol)
    raw = ccall(:strcmp, Int32, (Cstring, Cstring), a, b)
    return Int(sign(raw))
end

function isless(a::Symbol, b::Symbol)
    return cmp(a, b) < 0
end
|
||||
|
||||
## Generic validation functions ##
|
||||
|
||||
# Every index between `start(s)` and `endof(s)` is valid for a directly
# indexable string.
function isvalid(s::DirectIndexString, i::Integer)
    return start(s) <= i <= endof(s)
end

"""
    isvalid(str::AbstractString, i::Integer)

Tells whether index `i` is valid for the given string.

The generic method returns `false` for indices below `1` or past the end, and
otherwise reports whether `next(str, i)` completes without throwing.

# Examples

```jldoctest
julia> str = "αβγdef";

julia> isvalid(str, 1)
true

julia> str[1]
'α': Unicode U+03b1 (category Ll: Letter, lowercase)

julia> isvalid(str, 2)
false

julia> str[2]
ERROR: UnicodeError: invalid character index
[...]
```
"""
function isvalid(s::AbstractString, i::Integer)
    if i < 1 || done(s, i)
        return false
    end
    try
        next(s, i)
    catch
        # any failure while decoding at `i` means the index is not valid
        return false
    end
    return true
end
|
||||
|
||||
## Generic indexing functions ##
|
||||
|
||||
# Directly indexable strings and arrays: neighbouring indices differ by one.
function prevind(s::DirectIndexString, i::Integer)
    return Int(i)-1
end

function prevind(s::AbstractArray, i::Integer)
    return Int(i)-1
end

function nextind(s::DirectIndexString, i::Integer)
    return Int(i)+1
end

function nextind(s::AbstractArray, i::Integer)
    return Int(i)+1
end

"""
    prevind(str::AbstractString, i::Integer)

Get the previous valid string index before `i`.
Returns a value less than `1` at the beginning of the string.
For `i` greater than `endof(str)`, `endof(str)` is returned.

# Examples

```jldoctest
julia> prevind("αβγdef", 3)
1

julia> prevind("αβγdef", 1)
0
```
"""
function prevind(s::AbstractString, i::Integer)
    stop = endof(s)
    i > stop && return stop
    # walk backwards until an index passes `isvalid`
    cand = Int(i)-1
    while cand >= 1
        isvalid(s, cand) && return cand
        cand -= 1
    end
    return 0 # out of range
end

"""
    nextind(str::AbstractString, i::Integer)

Get the next valid string index after `i`.
Returns a value greater than `endof(str)` at or after the end of the string.
For `i` less than `1`, `1` is returned.

# Examples

```jldoctest
julia> str = "αβγdef";

julia> nextind(str, 1)
3

julia> endof(str)
9

julia> nextind(str, 9)
10
```
"""
function nextind(s::AbstractString, i::Integer)
    stop = endof(s)
    i < 1 && return 1
    i > stop && return Int(i)+1
    # walk forwards until an index passes `isvalid`
    cand = Int(i)+1
    while cand <= stop
        isvalid(s, cand) && return cand
        cand += 1
    end
    # no valid index left: use the index that follows the last character
    next(s, stop)[2] # out of range
end
|
||||
|
||||
# Bounds checks for string indexing. Each method returns `true` when every
# index lies between `start(s)` and `endof(s)` and throws `BoundsError`
# otherwise.
function checkbounds(s::AbstractString, i::Integer)
    if start(s) <= i <= endof(s)
        return true
    end
    throw(BoundsError(s, i))
end

# An empty range is always accepted; otherwise both extremes must be in bounds.
function checkbounds(s::AbstractString, r::Range{<:Integer})
    isempty(r) && return true
    if minimum(r) >= start(s) && maximum(r) <= endof(s)
        return true
    end
    throw(BoundsError(s, r))
end

# The following will end up using a deprecated checkbounds, when the covariant parameter is not Integer
function checkbounds(s::AbstractString, I::AbstractArray{<:Real})
    return all(idx -> checkbounds(s, idx), I)
end

function checkbounds(s::AbstractString, I::AbstractArray{<:Integer})
    return all(idx -> checkbounds(s, idx), I)
end
|
||||
|
||||
# For directly indexable strings the two kinds of index coincide, so the
# (bounds-checked) index is returned unchanged.
function ind2chr(s::DirectIndexString, i::Integer)
    checkbounds(s, i)
    return i
end

function chr2ind(s::DirectIndexString, i::Integer)
    checkbounds(s, i)
    return i
end


"""
    ind2chr(s::AbstractString, i::Integer)

Convert a byte index `i` to a character index with
respect to string `s`.

See also [`chr2ind`](@ref).

# Example

```jldoctest
julia> str = "αβγdef";

julia> ind2chr(str, 3)
2

julia> chr2ind(str, 2)
3
```
"""
function ind2chr(s::AbstractString, i::Integer)
    s[i] # throws error if invalid
    nchars = 1
    pos = start(s)
    while true
        # advance first, exactly like the character walk in `chr2ind`
        following = next(s, pos)[2]
        i <= pos && return nchars
        nchars += 1
        pos = following
    end
end

"""
    chr2ind(s::AbstractString, i::Integer)

Convert a character index `i` to a byte index.

See also [`ind2chr`](@ref).

# Example

```jldoctest
julia> str = "αβγdef";

julia> chr2ind(str, 2)
3

julia> ind2chr(str, 3)
2
```
"""
function chr2ind(s::AbstractString, i::Integer)
    i < start(s) && throw(BoundsError(s, i))
    nchars = 1
    pos = start(s)
    while true
        # a character count past the end surfaces as whatever `next` throws
        following = next(s, pos)[2]
        i == nchars && return pos
        nchars += 1
        pos = following
    end
end
|
||||
|
||||
# Iterator over the indices of a string, as returned by `eachindex(s)`.
struct EachStringIndex{T<:AbstractString}
    s::T    # the string whose indices are enumerated
end
eachindex(s::AbstractString) = EachStringIndex(s)

# The iterator has as many elements as the wrapped string has characters.
length(e::EachStringIndex) = length(e.s)
# The iteration state is the current string index; each step yields that
# index and moves on to `nextind`.
start(e::EachStringIndex) = start(e.s)
next(e::EachStringIndex, state) = (state, nextind(e.s, state))
done(e::EachStringIndex, state) = done(e.s, state)
# `Type{<:EachStringIndex}` so that the method also applies to concrete
# instantiations such as `EachStringIndex{String}`. A bare
# `Type{EachStringIndex}` only matches the unparameterized type, which no
# actual iterator instance has.
eltype(::Type{<:EachStringIndex}) = Int
|
||||
|
||||
## character column width function ##

"""
    strwidth(s::AbstractString)

Gives the number of columns needed to print a string, computed as the sum of
`charwidth` over its characters.

# Example

```jldoctest
julia> strwidth("March")
5
```
"""
function strwidth(s::AbstractString)
    total = 0
    for ch in s
        total += charwidth(ch)
    end
    return total
end

"""
    isascii(c::Union{Char,AbstractString}) -> Bool

Tests whether a character belongs to the ASCII character set, or whether this is true for
all elements of a string.
"""
function isascii(c::Char)
    return c < Char(0x80)
end

function isascii(s::AbstractString)
    return all(isascii, s)
end
|
||||
|
||||
## string promotion rules ##

# Any two string types promote to `String`.
function promote_rule(::Type{<:AbstractString}, ::Type{<:AbstractString})
    return String
end

"""
    isxdigit(c::Char) -> Bool

Tests whether a character is a valid hexadecimal digit. Note that this does not
include `x` (as in the standard `0x` prefix).

# Example

```jldoctest
julia> isxdigit('a')
true

julia> isxdigit('x')
false
```
"""
function isxdigit(c::Char)
    if '0' <= c <= '9'
        return true
    elseif 'a' <= c <= 'f'
        return true
    else
        return 'A' <= c <= 'F'
    end
end
|
||||
|
||||
## uppercase, lowercase, and titlecase transformations ##

"""
    uppercase(s::AbstractString)

Returns `s` with all characters converted to uppercase.

# Example

```jldoctest
julia> uppercase("Julia")
"JULIA"
```
"""
function uppercase(s::AbstractString)
    return map(uppercase, s)
end

"""
    lowercase(s::AbstractString)

Returns `s` with all characters converted to lowercase.

# Example

```jldoctest
julia> lowercase("STRINGS AND THINGS")
"strings and things"
```
"""
function lowercase(s::AbstractString)
    return map(lowercase, s)
end

"""
    titlecase(s::AbstractString)

Capitalizes the first character of each word in `s`. Words are delimited by
characters for which `isspace` is true; all other characters are copied
unchanged.

# Example

```jldoctest
julia> titlecase("the julia programming language")
"The Julia Programming Language"
```
"""
function titlecase(s::AbstractString)
    out = IOBuffer()
    atwordstart = true
    for ch in s
        blank = isspace(ch)
        # only the first non-space character after whitespace is converted
        print(out, (blank || !atwordstart) ? ch : titlecase(ch))
        atwordstart = blank
    end
    return String(take!(out))
end

"""
    ucfirst(s::AbstractString)

Returns `s` with the first character converted to uppercase. If `s` is empty
or already starts with an uppercase character, `s` itself is returned.

# Example

```jldoctest
julia> ucfirst("python")
"Python"
```
"""
function ucfirst(s::AbstractString)
    if isempty(s) || isupper(s[1])
        return s
    end
    return string(uppercase(s[1]), s[nextind(s,1):end])
end

"""
    lcfirst(s::AbstractString)

Returns `s` with the first character converted to lowercase. If `s` is empty
or already starts with a lowercase character, `s` itself is returned.

# Example

```jldoctest
julia> lcfirst("Julia")
"julia"
```
"""
function lcfirst(s::AbstractString)
    if isempty(s) || islower(s[1])
        return s
    end
    return string(lowercase(s[1]), s[nextind(s,1):end])
end
|
||||
|
||||
## string map, filter, has ##

# Apply `f` to every character of `s` and collect the results into a new
# `String`. `f` has to return a `Char` for every input character; anything
# else raises an `ArgumentError`.
function map(f, s::AbstractString)
    buf = IOBuffer(StringVector(endof(s)), true, true)
    # the buffer was created over pre-sized storage; start writing at the front
    truncate(buf, 0)
    for ch in s
        mapped = f(ch)
        isa(mapped, Char) ||
            throw(ArgumentError("map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"))
        write(buf, mapped::Char)
    end
    return String(take!(buf))
end
|
||||
|
||||
# Keep the characters of `s` for which `f` returns `true` and return them as
# a new `String`.
function filter(f, s::AbstractString)
    buf = IOBuffer(StringVector(endof(s)), true, true)
    # the buffer was created over pre-sized storage; start writing at the front
    truncate(buf, 0)
    for ch in s
        f(ch) && write(buf, ch)
    end
    return String(take!(buf))
end
|
||||
15
julia-0.6.2/share/julia/base/strings/errors.jl
Normal file
15
julia-0.6.2/share/julia/base/strings/errors.jl
Normal file
@@ -0,0 +1,15 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
## Error messages for Unicode / UTF support

# Message templates. `<<1>>` and `<<2>>` are placeholders that the `show`
# method for `UnicodeError` replaces with the error position and the
# hexadecimal form of the offending value.
const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
const UTF_ERR_INVALID_INDEX = "invalid character index <<1>> (0x<<2>> is a continuation byte)"

# Exception describing a Unicode decoding or indexing problem.
mutable struct UnicodeError <: Exception
    errmsg::AbstractString  # a UTF_ERR_ message template
    errpos::Int32           # position of the invalid character
    errchr::UInt32          # the invalid character
end
|
||||
|
||||
# Print the error with the `<<1>>` / `<<2>>` placeholders of its message
# replaced by the position and the hex representation of the bad value.
function show(io::IO, exc::UnicodeError)
    msg = string("UnicodeError: ", exc.errmsg)
    msg = replace(msg, "<<1>>", string(exc.errpos))
    msg = replace(msg, "<<2>>", hex(exc.errchr))
    print(io, msg)
end
|
||||
451
julia-0.6.2/share/julia/base/strings/io.jl
Normal file
451
julia-0.6.2/share/julia/base/strings/io.jl
Normal file
@@ -0,0 +1,451 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
## core text I/O ##


"""
    print(io::IO, x)

Write to `io` (or to the default output stream [`STDOUT`](@ref)
if `io` is not given) a canonical (un-decorated) text representation
of a value if there is one, otherwise call [`show`](@ref).
The representation used by `print` includes minimal formatting and tries to
avoid Julia-specific details.

```jldoctest
julia> print("Hello World!")
Hello World!
julia> io = IOBuffer();

julia> print(io, "Hello World!")

julia> String(take!(io))
"Hello World!"
```
"""
function print(io::IO, x)
    # hold the stream's lock for the duration of the `show` call and release
    # it again even if `show` throws
    lock(io)
    try
        show(io, x)
    finally
        unlock(io)
    end
    return nothing
end
|
||||
|
||||
# Print several values one after another. The stream's lock is taken once
# around the whole sequence and released even if one of the prints throws.
function print(io::IO, xs...)
    lock(io)
    try
        for item in xs
            print(io, item)
        end
    finally
        unlock(io)
    end
    return nothing
end

"""
    println(io::IO, xs...)

Print (using [`print`](@ref)) `xs` followed by a newline.
If `io` is not supplied, prints to [`STDOUT`](@ref).
"""
function println(io::IO, xs...)
    return print(io, xs..., '\n')
end
|
||||
|
||||
## conversion of general objects to strings ##

# `sprint` with a size hint: the buffer is created over `size` bytes of
# storage. When `env` is given, `f` receives an `IOContext` wrapping the
# buffer with that environment instead of the bare buffer.
function sprint(size::Integer, f::Function, args...; env=nothing)
    buf = IOBuffer(StringVector(size), true, true)
    # specialized version of truncate(buf,0)
    buf.size = 0
    buf.ptr = 1
    target = env === nothing ? buf : IOContext(buf, env)
    f(target, args...)
    return String(resize!(buf.data, buf.size))
end

"""
    sprint(f::Function, args...)

Call the given function with an I/O stream and the supplied extra arguments.
Everything written to this I/O stream is returned as a string.

```jldoctest
julia> sprint(showcompact, 66.66666)
"66.6667"
```
"""
function sprint(f::Function, args...)
    return sprint(0, f, args...)
end
|
||||
|
||||
# Initial buffer size used by `print_to_string`, chosen from its first
# argument: the last index for strings, fixed sizes for `Float64` and
# `Float32`, and zero for everything else.
function tostr_sizehint(x)
    return 0
end

function tostr_sizehint(x::AbstractString)
    return endof(x)
end

function tostr_sizehint(x::Float64)
    return 20
end

function tostr_sizehint(x::Float32)
    return 12
end
|
||||
|
||||
# Print every element of `xs` into one buffer and return the result as a
# `String`. With `env` set, printing goes through an `IOContext` carrying
# that environment. Called without positional arguments it returns "".
function print_to_string(xs...; env=nothing)
    # Nothing to print. Returning here also keeps the `xs[1]` size-hint lookup
    # below from throwing a BoundsError on an empty argument list.
    isempty(xs) && return ""
    # specialized for performance reasons
    s = IOBuffer(StringVector(tostr_sizehint(xs[1])), true, true)
    # specialized version of truncate(s,0)
    s.size = 0
    s.ptr = 1
    if env !== nothing
        env_io = IOContext(s, env)
        for x in xs
            print(env_io, x)
        end
    else
        for x in xs
            print(s, x)
        end
    end
    String(resize!(s.data, s.size))
end
|
||||
|
||||
# Like `string(xs...)`, but printing happens with the given `env` passed on
# to `print_to_string`.
function string_with_env(env, xs...)
    return print_to_string(xs...; env=env)
end

"""
    string(xs...)

Create a string from any values using the [`print`](@ref) function.

```jldoctest
julia> string("a", 1, true)
"a1true"
```
"""
function string(xs...)
    return print_to_string(xs...)
end
|
||||
|
||||
# Printing a string writes its contents and returns `nothing`.
function print(io::IO, s::AbstractString)
    write(io, s)
    return nothing
end

# Generic string write: one character at a time, returning the sum of what
# the individual `write` calls report.
function write(io::IO, s::AbstractString)
    written = 0
    for ch in s
        written += write(io, ch)
    end
    return written
end

# `show` prints the quoted, escaped form.
function show(io::IO, s::AbstractString)
    return print_quoted(io, s)
end

# Substrings of `String` are copied into the buffer with a single
# `unsafe_write` starting at the substring's offset in its parent string.
function write(to::AbstractIOBuffer, s::SubString{String})
    if s.endof == 0
        return 0
    end
    nbytes = UInt(nextind(s, s.endof) - 1)
    return unsafe_write(to, pointer(s.string, s.offset + 1), nbytes)
end
|
||||
|
||||
## printing literal quoted string data ##

# this is the inverse of print_unescaped_chars(io, s, "\\\")

# Print `s` between double quotes, putting a backslash in front of every
# double quote it contains; all other characters are printed as they are.
function print_quoted_literal(io, s::AbstractString)
    print(io, '"')
    for ch in s
        if ch == '"'
            print(io, "\\\"")
        else
            print(io, ch)
        end
    end
    print(io, '"')
end

"""
    repr(x)

Create a string from any value using the [`showall`](@ref) function.
"""
function repr(x)
    buf = IOBuffer()
    showall(buf, x)
    return String(take!(buf))
end
|
||||
|
||||
# IOBuffer views of a (byte)string:

"""
    IOBuffer(string::String)

Create a read-only `IOBuffer` on the data underlying the given string.

```jldoctest
julia> io = IOBuffer("Haho");

julia> String(take!(io))
"Haho"

julia> String(take!(io))
"Haho"
```
"""
function IOBuffer(str::String)
    bytes = Vector{UInt8}(str)
    return IOBuffer(bytes)
end

# For a substring, the buffer is built over a view of the parent string's
# bytes covering exactly the substring's range.
function IOBuffer(s::SubString{String})
    parentbytes = Vector{UInt8}(s.string)
    window = view(parentbytes, s.offset + 1 : s.offset + sizeof(s))
    return IOBuffer(window)
end
|
||||
|
||||
# join is implemented using IO

"""
    join(io::IO, strings, delim, [last])

Join an array of `strings` into a single string, inserting the given delimiter between
adjacent strings. If `last` is given, it will be used instead of `delim` between the last
two strings. For example,

```jldoctest
julia> join(["apples", "bananas", "pineapples"], ", ", " and ")
"apples, bananas and pineapples"
```

`strings` can be any iterable over elements `x` which are convertible to strings
via `print(io::IOBuffer, x)`. `strings` will be printed to `io`.
"""
function join(io::IO, strings, delim, last)
    state = start(strings)
    done(strings, state) && return
    item, state = next(strings, state)
    print(io, item)
    # `done` is queried once per element and the answer reused, both to pick
    # the separator and to decide whether to continue
    finished = done(strings, state)
    while !finished
        item, state = next(strings, state)
        finished = done(strings, state)
        print(io, finished ? last : delim)
        print(io, item)
    end
end
|
||||
|
||||
# Print the elements of `strings` to `io`, with `delim` between neighbours.
function join(io::IO, strings, delim)
    state = start(strings)
    finished = done(strings, state)
    while !finished
        item, state = next(strings, state)
        # `done` is queried once per element and the answer reused
        finished = done(strings, state)
        print(io, item)
        finished || print(io, delim)
    end
end

# Without a delimiter the elements are printed back to back.
function join(io::IO, strings)
    return join(io, strings, "")
end

# Without an `io`, the joined text is collected and returned as a string.
function join(args...)
    return sprint(join, args...)
end
|
||||
|
||||
## string escaping & unescaping ##
|
||||
|
||||
# True when the character at index `i` exists and is a hex digit. In that case
# a preceding \u / \U escape is written at full width so the digit cannot be
# read as part of the escape.
function need_full_hex(s::AbstractString, i::Int)
    return !done(s, i) && isxdigit(next(s, i)[1])
end

# Escape for a NUL character: the short "\0" unless an octal digit follows,
# in which case "\x00" is used.
function escape_nul(s::AbstractString, i::Int)
    if !done(s, i) && '0' <= next(s, i)[1] <= '7'
        return "\\x00"
    end
    return "\\0"
end

"""
    escape_string([io,] str::AbstractString[, esc::AbstractString]) -> AbstractString

General escaping of traditional C and Unicode escape sequences.
Any characters in `esc` are also escaped (with a backslash).
See also [`unescape_string`](@ref).
"""
function escape_string(io, s::AbstractString, esc::AbstractString)
    i = start(s)
    while !done(s, i)
        c, j = next(s, i)
        # the order of the tests matters: the first matching rule wins
        if c == '\0'
            print(io, escape_nul(s, j))
        elseif c == '\e'
            print(io, "\\e")
        elseif c == '\\'
            print(io, "\\\\")
        elseif c in esc
            print(io, '\\', c)
        elseif '\a' <= c <= '\r'
            # \a .. \r are consecutive code points 7..13
            print(io, '\\', "abtnvfr"[Int(c)-6])
        elseif isprint(c)
            print(io, c)
        elseif c <= '\x7f'
            print(io, "\\x", hex(c, 2))
        elseif c <= '\uffff'
            print(io, "\\u", hex(c, need_full_hex(s, j) ? 4 : 2))
        else
            print(io, "\\U", hex(c, need_full_hex(s, j) ? 8 : 4))
        end
        i = j
    end
end

# Without an `io`, return the escaped text as a string; double quotes are
# escaped as well.
function escape_string(s::AbstractString)
    return sprint(endof(s), escape_string, s, "\"")
end
|
||||
|
||||
# Print `s` in double quotes with its contents escaped; in addition to the
# standard escapes, `"` and `\$` are backslash-escaped.
function print_quoted(io, s::AbstractString)
    quotechar = '"'
    print(io, quotechar)
    escape_string(io, s, "\"\$") #"# work around syntax highlighting problem
    print(io, quotechar)
end
|
||||
|
||||
# bare minimum unescaping function unescapes only given characters

# Print `s` with the backslash removed from every backslash-plus-character
# pair whose second character occurs in `esc`. The backslash itself is always
# treated as a member of `esc`.
function print_unescaped_chars(io, s::AbstractString, esc::AbstractString)
    '\\' in esc || (esc = string("\\", esc))
    pos = start(s)
    while !done(s, pos)
        ch, pos = next(s, pos)
        # drop the backslash and emit the escaped character instead
        if ch == '\\' && !done(s, pos) && s[pos] in esc
            ch, pos = next(s, pos)
        end
        print(io, ch)
    end
end

function unescape_chars(s::AbstractString, esc::AbstractString)
    return sprint(endof(s), print_unescaped_chars, s, esc)
end
|
||||
|
||||
# general unescaping of traditional C and Unicode escape sequences

"""
    unescape_string([io,] s::AbstractString) -> AbstractString

General unescaping of traditional C and Unicode escape sequences. Reverse of
[`escape_string`](@ref).
"""
function unescape_string(io, s::AbstractString)
    i = start(s)
    while !done(s, i)
        c, i = next(s, i)
        # ordinary character, or a backslash that ends the string: copy it
        if done(s, i) || c != '\\'
            print(io, c)
            continue
        end
        c, i = next(s, i)
        if c == 'x' || c == 'u' || c == 'U'
            # hex escape: at most 2 (\x), 4 (\u) or 8 (\U) hex digits
            n = k = 0
            m = c == 'x' ? 2 : (c == 'u' ? 4 : 8)
            while (k += 1) <= m && !done(s, i)
                c, j = next(s, i)
                if '0' <= c <= '9'
                    n = n<<4 + c-'0'
                elseif 'a' <= c <= 'f'
                    n = n<<4 + c-'a'+10
                elseif 'A' <= c <= 'F'
                    n = n<<4 + c-'A'+10
                else
                    break
                end
                i = j
            end
            # k is still 1 when not a single hex digit was consumed
            if k == 1
                throw(ArgumentError("invalid $(m == 2 ? "hex (\\x)" :
                                    "unicode (\\u)") escape sequence used in $(repr(s))"))
            end
            if m == 2 # \x escape sequence
                write(io, UInt8(n))
            else
                print(io, Char(n))
            end
        elseif '0' <= c <= '7'
            # octal escape: the digit just read plus at most two more
            k = 1
            n = c-'0'
            while (k += 1) <= 3 && !done(s, i)
                c, j = next(s, i)
                if '0' <= c <= '7'
                    n = n<<3 + c-'0'
                else
                    break
                end
                i = j
            end
            n > 255 && throw(ArgumentError("octal escape sequence out of range"))
            write(io, UInt8(n))
        else
            # single-letter escapes; any other character stands for itself
            print(io, c == 'a' ? '\a' :
                      c == 'b' ? '\b' :
                      c == 't' ? '\t' :
                      c == 'n' ? '\n' :
                      c == 'v' ? '\v' :
                      c == 'f' ? '\f' :
                      c == 'r' ? '\r' :
                      c == 'e' ? '\e' : c)
        end
    end
end

function unescape_string(s::AbstractString)
    return sprint(endof(s), unescape_string, s)
end
|
||||
|
||||
# b"..." literals: the text is unescaped when the macro is expanded, and the
# generated expression converts that string to a `Vector{UInt8}`.
macro b_str(s)
    :(Vector{UInt8}($(unescape_string(s))))
end

# raw"..." literals: the string is returned exactly as the macro received it.
macro raw_str(s)
    s
end
|
||||
|
||||
## multiline strings ##

"""
Calculate the width of leading blank space, and also return if string is blank

Returns:

* width of leading whitespace, flag if string is totally blank

A space counts as one column; a tab advances to the next multiple of
`tabwidth`.
"""
function indentation(str::AbstractString; tabwidth=8)
    width = 0
    for ch in str
        if ch == '\t'
            width = div(width + tabwidth, tabwidth) * tabwidth
        elseif ch == ' '
            width += 1
        else
            # first non-blank character: the string is not blank
            return width, false
        end
    end
    return width, true
end

"""
Removes leading indentation from string

Returns:

* `String` of multiline string, with leading indentation of `indent` removed

With `indent == 0` the input is returned unchanged. Tabs are expanded to
spaces using `tabwidth`.
"""
function unindent(str::AbstractString, indent::Int; tabwidth=8)
    indent == 0 && return str
    pos = start(str)
    endpos = endof(str)
    # Note: this loses the type of the original string
    buf = IOBuffer(StringVector(endpos), true, true)
    truncate(buf, 0)

    # write `n` spaces (nothing when `n` is not positive)
    function pad(n)
        for k = 1:n
            write(buf, ' ')
        end
    end
    # column of the tab stop that follows column `c`
    tabstop(c) = div(c + tabwidth, tabwidth) * tabwidth

    cutting = true  # still inside the leading whitespace of the current line
    col = 0         # current column (0 based)
    while pos <= endpos
        ch, pos = next(str, pos)
        if cutting
            if ch == ' '
                col += 1
            elseif ch == '\t'
                col = tabstop(col)
            elseif ch == '\n'
                # whitespace-only line: keep what exceeds `indent`
                pad(col - indent)
                col = 0
                write(buf, '\n')
            else
                # first visible character: emit the indentation that remains
                # after removing `indent` columns, then the character
                cutting = false
                pad(col - indent)
                col += 1
                write(buf, ch)
            end
        elseif ch == '\t'
            # internal tab: output the number of spaces that would have been
            # seen with the original indentation
            upd = tabstop(col)
            pad(upd - col)
            col = upd
        elseif ch == '\n'
            cutting = true
            col = 0
            write(buf, '\n')
        else
            col += 1
            write(buf, ch)
        end
    end
    # If we were still "cutting" when we hit the end of the string,
    # we need to output the right number of spaces for the indentation
    cutting && pad(col - indent)
    String(take!(buf))
end
|
||||
|
||||
# Build a `String` from a vector of characters. A high surrogate
# (U+D800..U+DBFF) immediately followed by a low surrogate (U+DC00..U+DFFF)
# is combined into the single code point the pair encodes; every other
# character is written unchanged.
function convert(::Type{String}, chars::AbstractVector{Char})
    sprint(length(chars), io->begin
        state = start(chars)
        while !done(chars, state)
            c, state = next(chars, state)
            # Only look for a partner when there is a next element. A high
            # surrogate in last position would otherwise index past the end
            # of `chars`.
            if '\ud7ff' < c && c + 1024 < '\ue000' && !done(chars, state)
                d, after = next(chars, state)
                if '\ud7ff' < d - 1024 && d < '\ue000'
                    c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) | (UInt32(d) & 0x03ff))
                    state = after
                end
                # When `d` is not a low surrogate it is left unconsumed, so the
                # next iteration handles it and it can start a pair of its own.
            end
            write(io, c)
        end
    end)
end
|
||||
380
julia-0.6.2/share/julia/base/strings/search.jl
Normal file
380
julia-0.6.2/share/julia/base/strings/search.jl
Normal file
@@ -0,0 +1,380 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
# A single character or a collection of characters to search for.
const Chars = Union{Char,Tuple{Vararg{Char}},AbstractVector{Char},Set{Char}}

"""
    search(string::AbstractString, chars::Chars, [start::Integer])

Search for the first occurrence of the given characters within the given string. The second
argument may be a single character, a vector or a set of characters, a string, or a regular
expression (though regular expressions are only allowed on contiguous strings, such as ASCII
or UTF-8 strings). The third argument optionally specifies a starting index. The return
value is a range of indexes where the matching sequence is found, such that `s[search(s,x)] == x`:

`search(string, "substring")` = `start:end` such that `string[start:end] == "substring"`, or
`0:-1` if unmatched.

`search(string, 'c')` = `index` such that `string[index] == 'c'`, or `0` if unmatched.

```jldoctest
julia> search("Hello to the world", "z")
0:-1

julia> search("JuliaLang","Julia")
1:5
```
"""
function search(s::AbstractString, c::Chars, i::Integer)
    # an empty character collection matches at any in-range position
    if isempty(c)
        1 <= i <= nextind(s, endof(s)) || throw(BoundsError(s, i))
        return i
    end
    (i < 1 || i > nextind(s, endof(s))) && throw(BoundsError(s, i))
    pos = i
    while !done(s, pos)
        ch, following = next(s, pos)
        ch in c && return pos
        pos = following
    end
    return 0
end

function search(s::AbstractString, c::Chars)
    return search(s, c, start(s))
end

# `c in s` holds when searching for `c` finds an index.
function in(c::Char, s::AbstractString)
    return search(s, c) != 0
end
|
||||
|
||||
# Generic forward search for `t` in `s`, starting at index `i`. Returns the
# index at which the first match begins, or 0 when there is none. An empty `t`
# matches at `i` itself as long as `i` is within range (BoundsError otherwise).
function _searchindex(s, t, i)
    if isempty(t)
        1 <= i <= nextind(s, endof(s)) || throw(BoundsError(s, i))
        return i
    end
    # first element of the pattern, and the state pointing at the rest of it
    head, tailstate = next(t, start(t))
    while true
        # jump to the next occurrence of the pattern's first element
        i = search(s, head, i)
        i == 0 && return 0
        resume = next(s, i)[2]
        tpos = tailstate
        spos = resume
        matched = true
        # compare the remainder of the pattern against what follows in `s`
        while !done(t, tpos)
            if done(s, spos)
                matched = false
                break
            end
            sc, spos = next(s, spos)
            tc, tpos = next(t, tpos)
            if sc != tc
                matched = false
                break
            end
        end
        matched && return i
        i = resume
    end
end
|
||||
|
||||
# 64-bit mask with a single bit set, selected by the low six bits of `c`.
function _search_bloom_mask(c)
    bit = c & 63
    return UInt64(1) << bit
end
|
||||
|
||||
# Uniform access to the `i`-th byte of either a `String` or a byte array.
function _nthbyte(s::String, i)
    return codeunit(s, i)
end

function _nthbyte(a::ByteArray, i)
    return a[i]
end
|
||||
|
||||
# Byte-level forward search for `t` in `s` starting at index `i`. Returns the
# 1-based index of the first match or 0. A 64-bit mask built from the pattern's
# bytes is used to skip ahead whenever the byte following the current window
# cannot occur in the pattern.
function _searchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, i)
    patlen = sizeof(t)
    srclen = sizeof(s)

    # degenerate cases: empty pattern, empty source, single-byte pattern
    if patlen == 0
        return 1 <= i <= srclen+1 ? max(1, i) : 0
    elseif srclen == 0
        return 0
    elseif patlen == 1
        return search(s, _nthbyte(t,1), i)
    end

    # largest 0-based offset at which the pattern still fits
    lastoff = srclen - patlen
    if lastoff < 0 || i - 1 > lastoff
        return 0
    end

    bloom = UInt64(0)
    shift = patlen - 1
    lastbyte = _nthbyte(t, patlen)
    for p in 1:patlen
        b = _nthbyte(t, p)
        bloom |= _search_bloom_mask(b)
        # shift derived from the last earlier occurrence of the final byte
        if b == lastbyte && p < patlen
            shift = patlen - p - 1
        end
    end

    off = i - 1     # 0-based offset of the current window
    while off <= lastoff
        if _nthbyte(s, off+patlen) == lastbyte
            # check candidate
            nmatch = 0
            while nmatch < patlen - 1 &&
                  _nthbyte(s, off+nmatch+1) == _nthbyte(t, nmatch+1)
                nmatch += 1
            end

            # match found
            if nmatch == patlen - 1
                return off+1
            end

            # no match, try to rule out the next character
            if off < lastoff &&
               (bloom & _search_bloom_mask(_nthbyte(s, off+patlen+1))) == 0
                off += patlen
            else
                off += shift
            end
        elseif off < lastoff &&
               (bloom & _search_bloom_mask(_nthbyte(s, off+patlen+1))) == 0
            off += patlen
        end
        off += 1
    end

    return 0
end
|
||||
|
||||
function searchindex(s::ByteArray, t::ByteArray, i)
    return _searchindex(s, t, i)
end

"""
    searchindex(s::AbstractString, substring, [start::Integer])

Similar to [`search`](@ref), but return only the start index at which
the substring is found, or `0` if it is not.

```jldoctest
julia> searchindex("Hello to the world", "z")
0

julia> searchindex("JuliaLang","Julia")
1

julia> searchindex("JuliaLang","Lang")
6
```
"""
function searchindex(s::AbstractString, t::AbstractString, i::Integer)
    return _searchindex(s, t, i)
end

# Without a start index the search begins at `start(s)`.
function searchindex(s::AbstractString, t::AbstractString)
    return searchindex(s, t, start(s))
end

function searchindex(s::AbstractString, c::Char, i::Integer)
    return _searchindex(s, c, i)
end

function searchindex(s::AbstractString, c::Char)
    return searchindex(s, c, start(s))
end
|
||||
|
||||
# `String`/`String` specialisation of `searchindex`.
function searchindex(s::String, t::String, i::Integer=1)
    # Fast case: when the last index of `t` is 1 the needle is a single
    # character, so the character search can be used directly.
    # (for multi-byte UTF-8 sequences, use searchindex on byte arrays instead)
    endof(t) == 1 && return search(s, t[1], i)
    return _searchindex(s, t, i)
end
|
||||
|
||||
# Turn the start index reported by `searchindex` into a range.
# An empty needle gives the empty range `idx:idx-1`; a failed search
# (index 0) gives `0:-1`.
function _search(s, t, i::Integer)
    first_idx = searchindex(s, t, i)
    isempty(t) && return first_idx:first_idx-1
    last_idx = first_idx > 0 ? first_idx + endof(t) - 1 : -1
    return first_idx:last_idx
end
|
||||
|
||||
# Range-returning substring search for strings and for byte arrays; both
# delegate to `_search`, starting at `start(s)` unless `i` is given.
function search(s::AbstractString, t::AbstractString, i::Integer=start(s))
    return _search(s, t, i)
end

function search(s::ByteArray, t::ByteArray, i::Integer=start(s))
    return _search(s, t, i)
end
|
||||
|
||||
# Reverse character search over the whole string: search the reversed view
# forwards, then map the hit back to an index into `s` (0 when not found).
function rsearch(s::AbstractString, c::Chars)
    pos = search(RevString(s), c)
    return pos == 0 ? 0 : endof(s) - pos + 1
end
|
||||
|
||||
"""
    rsearch(s::AbstractString, chars::Chars, [start::Integer])

Like [`search`](@ref), but return the last occurrence of the given characters
within the given string, searching in reverse from `start`.

```jldoctest
julia> rsearch("aaabbb","b")
6:6
```
"""
function rsearch(s::AbstractString, c::Chars, i::Integer)
    last_idx = endof(s)
    # Index `i` of `s` corresponds to index `last_idx - i + 1` of the reversed view.
    pos = search(RevString(s), c, last_idx - i + 1)
    return pos == 0 ? 0 : last_idx - pos + 1
end
|
||||
|
||||
# Generic reverse substring search built on `RevString` iteration.
# Returns the start index of the match in `s`, or 0 when there is none.
# An empty needle returns `i` itself when it is in `1:nextind(s,endof(s))`
# and throws `BoundsError` otherwise.
function _rsearchindex(s, t, i)
    if isempty(t)
        1 <= i <= nextind(s, endof(s)) || throw(BoundsError(s, i))
        return i
    end
    rt = RevString(t)
    rs = RevString(s)
    last_idx = endof(s)
    # Last character of the needle, plus the iteration state for the rest of it.
    tail_char, t_rest = next(rt, start(rt))
    while true
        # Find the next candidate position of the needle's last character.
        i = rsearch(s, tail_char, i)
        i == 0 && return 0
        s_rest = next(rs, last_idx - i + 1)[2]
        tpos = t_rest
        spos = s_rest
        matched = true
        # Compare the remaining characters, walking both strings backwards.
        while !done(rt, tpos)
            if done(rs, spos)
                matched = false
                break
            end
            sc, spos = next(rs, spos)
            tc, tpos = next(rt, tpos)
            if sc != tc
                matched = false
                break
            end
        end
        matched && return nextind(s, last_idx - spos + 1)
        # Resume the search just before the rejected candidate.
        i = last_idx - s_rest + 1
    end
end
|
||||
|
||||
# Byte-level reverse substring search for `String`s and byte arrays.
# `k` is the byte position to start from. Returns the byte index where the
# match begins, or 0 when there is no match.
function _rsearchindex(s::Union{String,ByteArray}, t::Union{String,ByteArray}, k)
    n = sizeof(t)
    m = sizeof(s)

    # Degenerate cases: empty needle, empty haystack, single-byte needle.
    n == 0 && return (0 <= k <= m ? max(k, 1) : 0)
    m == 0 && return 0
    n == 1 && return rsearch(s, _nthbyte(t, 1), k)

    w = m - n
    (w < 0 || k <= 0) && return 0

    # Build the bloom mask over the needle bytes and the shift applied after
    # a failed candidate whose first byte matched.
    head = _nthbyte(t, 1)
    mask = UInt64(0)
    shift = n - 1
    for p in n:-1:1
        tb = _nthbyte(t, p)
        mask |= _search_bloom_mask(tb)
        if p > 1 && tb == head
            shift = p - 2
        end
    end

    i = min(k - n + 1, w + 1)
    while i > 0
        if _nthbyte(s, i) == head
            # check candidate
            matched = 1
            while matched < n && _nthbyte(s, i + matched) == _nthbyte(t, matched + 1)
                matched += 1
            end

            # match found
            matched == n && return i

            # no match, try to rule out the next character
            if i > 1 && (mask & _search_bloom_mask(_nthbyte(s, i - 1))) == 0
                i -= n
            else
                i -= shift
            end
        elseif i > 1 && (mask & _search_bloom_mask(_nthbyte(s, i - 1))) == 0
            i -= n
        end
        i -= 1
    end

    return 0
end
|
||||
|
||||
# Byte-array haystack and needle: forward directly to `_rsearchindex`.
rsearchindex(s::ByteArray, t::ByteArray, i::Integer) = _rsearchindex(s,t,i)
|
||||
|
||||
"""
    rsearchindex(s::AbstractString, substring, [start::Integer])

Like [`rsearch`](@ref), except that only the index at which the match begins
is returned; the result is `0` when `substring` is not found.

```jldoctest
julia> rsearchindex("aaabbb","b")
6

julia> rsearchindex("aaabbb","a")
3
```
"""
function rsearchindex(s::AbstractString, t::AbstractString, i::Integer)
    return _rsearchindex(s, t, i)
end

function rsearchindex(s::AbstractString, t::AbstractString)
    # Both strings empty: report index 1 without searching.
    isempty(s) && isempty(t) && return 1
    return rsearchindex(s, t, endof(s))
end
|
||||
|
||||
# `String`/`String` reverse search over the whole haystack.
function rsearchindex(s::String, t::String)
    # Fast case: a needle whose last index is 1 is a single character.
    # (for multi-byte UTF-8 sequences, use rsearchindex instead)
    endof(t) == 1 && return rsearch(s, t[1])
    return _rsearchindex(s, t, sizeof(s))
end
|
||||
|
||||
# `String`/`String` reverse search starting from index `i`.
function rsearchindex(s::String, t::String, i::Integer)
    tlast = endof(t)
    # Fast case: a needle whose last index is 1 is a single character.
    # (for multi-byte UTF-8 sequences, use rsearchindex instead)
    tlast == 1 && return rsearch(s, t[1], i)
    # Non-empty needle: hand the byte-level search the last byte of the
    # character at `i`.
    tlast != 0 && return _rsearchindex(s, t, nextind(s, i) - 1)
    # Empty needle from here on.
    i > sizeof(s) && return 0
    return i == 0 ? 1 : i
end
|
||||
|
||||
# Turn the start index reported by `rsearchindex` into a range.
# An empty needle gives the empty range `idx:idx-1`; a failed search
# (index 0) gives `0:-1`.
function _rsearch(s, t, i::Integer)
    first_idx = rsearchindex(s, t, i)
    isempty(t) && return first_idx:first_idx-1
    last_idx = first_idx > 0 ? first_idx + endof(t) - 1 : -1
    return first_idx:last_idx
end
|
||||
|
||||
# Range-returning reverse substring search for strings and for byte arrays;
# both delegate to `_rsearch`, starting at `endof(s)` unless `i` is given.
function rsearch(s::AbstractString, t::AbstractString, i::Integer=endof(s))
    return _rsearch(s, t, i)
end

function rsearch(s::ByteArray, t::ByteArray, i::Integer=endof(s))
    return _rsearch(s, t, i)
end
|
||||
|
||||
"""
    contains(haystack::AbstractString, needle::AbstractString)

Return `true` if `needle` occurs as a substring of `haystack`.

```jldoctest
julia> contains("JuliaLang is pretty cool!", "Julia")
true
```
"""
function contains(haystack::AbstractString, needle::AbstractString)
    return searchindex(haystack, needle) != 0
end
|
||||
|
||||
# `in` on two strings always raises an error that points the caller at `contains`.
in(::AbstractString, ::AbstractString) = error("use contains(x,y) for string containment")
|
||||
438
julia-0.6.2/share/julia/base/strings/string.jl
Normal file
438
julia-0.6.2/share/julia/base/strings/string.jl
Normal file
@@ -0,0 +1,438 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
# Vectors of signed or unsigned 8-bit integers.
const ByteArray = Union{Vector{UInt8},Vector{Int8}}
|
||||
|
||||
## constructors and conversions ##
|
||||
|
||||
# String constructor docstring from boot.jl, workaround for #16730
|
||||
# and the unavailability of @doc in boot.jl context.
|
||||
"""
    String(v::Vector{UInt8})

Create a new `String` from a vector `v` of bytes containing UTF-8 encoded
characters. The resulting string takes "ownership" of the array: `v` should
not be modified for as long as the string exists, because strings are
supposed to be immutable in Julia.

If you need to subsequently modify `v`, use `String(copy(v))` instead.
"""
String(v::Array{UInt8,1}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v)
|
||||
|
||||
"""
    unsafe_string(p::Ptr{UInt8}, [length::Integer])

Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
(The pointer can be safely freed afterwards.) If `length` is specified
(the length of the data in bytes), the string does not have to be NUL-terminated.

This function is labelled "unsafe" because it will crash if `p` is not
a valid memory address to data of the requested length.
"""
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end

function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    if p == C_NULL
        throw(ArgumentError("cannot convert NULL to string"))
    end
    return ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end
|
||||
|
||||
# Allocate a `String` via `jl_alloc_string`, passing `n` as the size argument.
_string_n(n::Integer) = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n)
|
||||
|
||||
# Conversions between `String` and `Vector{UInt8}`.
function convert(::Type{Vector{UInt8}}, s::String)
    return ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)
end

# Converting a `String` to `String` returns the same object.
convert(::Type{String}, s::String) = s

function convert(::Type{String}, v::Vector{UInt8})
    return String(v)
end
|
||||
|
||||
## low-level functions ##
|
||||
|
||||
# Pointer to the first byte of `s`.
function pointer(s::String)
    return unsafe_convert(Ptr{UInt8}, s)
end

# Pointer to byte `i` of `s` (1-based offset from the first byte).
function pointer(s::String, i::Integer)
    return pointer(s) + (i - 1)
end
|
||||
|
||||
# The size of a `String` is its `len` field.
sizeof(s::String) = s.len
|
||||
|
||||
"""
    codeunit(s::AbstractString, i::Integer)

Return the `i`th code unit of an encoded string. For a UTF-8 string, for
example, this is the `i`th byte of its representation.
"""
codeunit(s::AbstractString, i::Integer)

# `String` method: bounds-checked (unless elided by `@inbounds`) raw byte load.
@inline function codeunit(s::String, i::Integer)
    @boundscheck 1 <= i <= s.len || throw(BoundsError(s, i))
    return unsafe_load(pointer(s), i)
end
|
||||
|
||||
# Write the `s.len` bytes of `s` to `io` straight from the string's memory.
write(io::IO, s::String) = unsafe_write(io, pointer(s), reinterpret(UInt, s.len))
|
||||
|
||||
## comparison ##
|
||||
|
||||
# Three-way comparison: `memcmp` over the common prefix, with ties broken by
# comparing the two lengths.
function cmp(a::String, b::String)
    shared = min(a.len, b.len)
    c = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, shared)
    if c < 0
        return -1
    elseif c > 0
        return +1
    end
    return cmp(a.len, b.len)
end
|
||||
|
||||
# Two strings are equal when their lengths match and `memcmp` over that
# length reports no difference.
function ==(a::String, b::String)
    a.len == b.len || return false
    return 0 == ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, a.len)
end
|
||||
|
||||
## prevind and nextind ##
|
||||
|
||||
# Step from `i` back to the nearest earlier byte index that is not a UTF-8
# continuation byte. Indices beyond the end map to `endof(s)`; the result is
# 0 when no such index exists.
function prevind(s::String, i::Integer)
    idx = Int(i)
    idx > s.len && return endof(s)
    idx -= 1
    @inbounds while idx > 0 && is_valid_continuation(codeunit(s, idx))
        idx -= 1
    end
    return idx
end
|
||||
|
||||
# Step from `i` forward to the nearest later byte index that is not a UTF-8
# continuation byte. Indices below 1 map to 1; the result is `s.len + 1`
# when the end of the string is reached.
function nextind(s::String, i::Integer)
    idx = Int(i)
    idx < 1 && return 1
    nbytes = s.len
    idx += 1
    @inbounds while idx <= nbytes && is_valid_continuation(codeunit(s, idx))
        idx += 1
    end
    return idx
end
|
||||
|
||||
## checking UTF-8 & ASCII validity ##
|
||||
|
||||
# Classify raw bytes with `u8_isvalid`. Result codes:
# 0: neither valid ASCII nor UTF-8
# 1: valid ASCII
# 2: valid UTF-8
function byte_string_classify(data::Vector{UInt8})
    return ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), data, length(data))
end

function byte_string_classify(s::String)
    return ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, s.len)
end

# Any non-zero classification counts as valid.
function isvalid(::Type{String}, s::Union{Vector{UInt8},String})
    return byte_string_classify(s) != 0
end

isvalid(s::String) = isvalid(String, s)
|
||||
|
||||
## basic UTF-8 decoding & iteration ##
|
||||
|
||||
# Surrogate predicates: mask off the low 10 bits (lead/trail) or the low
# 11 bits (either kind) and compare with the start of the block.
function is_surrogate_lead(c::Unsigned)
    return (c & ~0x003ff) == 0xd800
end

function is_surrogate_trail(c::Unsigned)
    return (c & ~0x003ff) == 0xdc00
end

function is_surrogate_codeunit(c::Unsigned)
    return (c & ~0x007ff) == 0xd800
end

# A UTF-8 continuation byte has the bit pattern 10xxxxxx.
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
|
||||
|
||||
# Constants subtracted from the accumulated byte sum in `slow_utf8_next`;
# indexed by (number of trailing bytes + 1).
const utf8_offset = [
    0x00000000, 0x00003080,
    0x000e2080, 0x03c82080,
    0xfa082080, 0x82082080,
]
|
||||
|
||||
# Number of trailing bytes for each possible first byte; `slow_utf8_next`
# indexes this table with (byte value + 1).
const utf8_trailing = [
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
]
|
||||
|
||||
## required core functionality ##
|
||||
|
||||
# Last valid index: scan back from the final byte past any UTF-8
# continuation bytes. Returns 0 for an empty string.
function endof(s::String)
    base = pointer(s)
    idx = s.len
    while idx > 0 && is_valid_continuation(unsafe_load(base, idx))
        idx -= 1
    end
    return idx
end
|
||||
|
||||
# Character count: the number of bytes that are not UTF-8 continuation bytes.
function length(s::String)
    base = pointer(s)
    nchars = 0
    for pos = 1:s.len
        nchars += !is_valid_continuation(unsafe_load(base, pos))
    end
    return nchars
end
|
||||
|
||||
# Out-of-line decoder for a character whose first byte `b` (at byte index `i`
# of the data at `p`, which is `l` bytes long) is not ASCII.
# Returns `(char, next_index)`. Throws `UnicodeError` when `b` is a
# continuation byte; returns `'\ufffd'` and `i+1` when the sequence would run
# past the end of the data.
@noinline function slow_utf8_next(p::Ptr{UInt8}, b::UInt8, i::Int, l::Int)
    if is_valid_continuation(b)
        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, unsafe_load(p,i)))
    end
    ntrail = utf8_trailing[b + 1]
    l < i + ntrail && return '\ufffd', i+1
    # Accumulate all bytes of the sequence, six bits apart, then subtract the
    # per-length constant from `utf8_offset`.
    code::UInt32 = 0
    for k = 1:(ntrail + 1)
        code <<= 6
        code += unsafe_load(p,i)
        i += 1
    end
    code -= utf8_offset[ntrail + 1]
    return Char(code), i
end
|
||||
|
||||
# Iteration is finished once the state is greater than `s.len`.
# This relies on `next` returning a value past the end of the String's
# underlying data, which is true for valid Strings.
function done(s::String, state)
    return state > s.len
end
|
||||
|
||||
# Decode the character at byte index `i`, returning `(char, next_index)`.
# The function is split into this small fast path for pure ASCII data (such
# as parsing numbers) and `slow_utf8_next`, which handles any UTF-8 data.
@inline function next(s::String, i::Int)
    @boundscheck if (i < 1) | (i > s.len)
        throw(BoundsError(s,i))
    end
    base = pointer(s)
    byte = unsafe_load(base, i)
    return byte < 0x80 ? (Char(byte), i + 1) : slow_utf8_next(base, byte, i, s.len)
end
|
||||
|
||||
# First byte of the UTF-8 encoding of `ch`, chosen by code point range.
function first_utf8_byte(ch::Char)
    c = UInt32(ch)
    if c < 0x80
        return c%UInt8
    elseif c < 0x800
        return ((c>>6) | 0xc0)%UInt8
    elseif c < 0x10000
        return ((c>>12) | 0xe0)%UInt8
    end
    return ((c>>18) | 0xf0)%UInt8
end
|
||||
|
||||
# Mirror index `i` around the byte length of `s`, then step back over any
# UTF-8 continuation bytes.
function reverseind(s::String, i::Integer)
    idx = s.len + 1 - i
    base = pointer(s)
    while is_valid_continuation(unsafe_load(base, idx))
        idx -= 1
    end
    return idx
end
|
||||
|
||||
## overload methods for efficiency ##
|
||||
|
||||
# An index is valid when it lies in `1:s.len` and the byte there is not a
# UTF-8 continuation byte.
function isvalid(s::String, i::Integer)
    1 <= i <= s.len || return false
    return !is_valid_continuation(unsafe_load(pointer(s), i))
end
|
||||
|
||||
# Extract the substring covering the byte range `r` as a new `String`.
# An empty range yields "". The first index must be in bounds and must not
# point at a UTF-8 continuation byte (`UnicodeError` otherwise); the last
# index must not exceed the byte length. The copy extends through the final
# byte of the character that starts at `last(r)`.
function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    i, j = first(r), last(r)
    l = s.len
    if i < 1 || i > l
        throw(BoundsError(s, i))
    end
    @inbounds si = codeunit(s, i)
    if is_valid_continuation(si)
        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, si))
    end
    if j > l
        # Report the string and the offending index, as the lower-bound
        # check above does, rather than an empty BoundsError.
        throw(BoundsError(s, j))
    end
    j = nextind(s,j)-1
    return unsafe_string(pointer(s,i), j-i+1)
end
|
||||
|
||||
# Find the first occurrence of character `c` in `s` at or after byte index
# `i`; returns its index, or 0 when absent. `i == sizeof(s) + 1` returns 0,
# any other out-of-range `i` throws `BoundsError`, and an `i` that points at
# a continuation byte throws `UnicodeError`.
function search(s::String, c::Char, i::Integer = 1)
    nbytes = sizeof(s)
    if i < 1 || i > nbytes
        i == nbytes + 1 && return 0
        throw(BoundsError(s, i))
    end
    unit = codeunit(s, i)
    if is_valid_continuation(unit)
        throw(UnicodeError(UTF_ERR_INVALID_INDEX, i, unit))
    end
    # ASCII characters are a plain byte search.
    if c < Char(0x80)
        return search(s, c%UInt8, i)
    end
    # Otherwise look for the first byte of the encoding and confirm the
    # whole character at each hit.
    lead = first_utf8_byte(c)
    while true
        i = search(s, lead, i)
        (i == 0 || s[i] == c) && return i
        i = next(s, i)[2]
    end
end
|
||||
|
||||
# Byte search via `memchr`: index of the first byte equal to `b` at or after
# position `i`, or 0 when absent. `i == sizeof(a) + 1` returns 0; any other
# out-of-range `i` throws `BoundsError`.
function search(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = 1)
    i < 1 && throw(BoundsError(a, i))
    nbytes = sizeof(a)
    if i > nbytes
        i == nbytes + 1 || throw(BoundsError(a, i))
        return 0
    end
    base = pointer(a)
    hit = ccall(:memchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), base+i-1, b, nbytes-i+1)
    return hit == C_NULL ? 0 : Int(hit-base+1)
end
|
||||
|
||||
# Character search in a byte array: ASCII characters are searched as one
# byte, others as the byte sequence of `string(b)` (start of the match range).
function search(a::ByteArray, b::Char, i::Integer = 1)
    return isascii(b) ? search(a, UInt8(b), i) :
                        search(a, Vector{UInt8}(string(b)), i).start
end
|
||||
|
||||
# Find the last occurrence of character `c` in `s` at or before byte index
# `i`; returns its index, or 0 when absent.
function rsearch(s::String, c::Char, i::Integer = s.len)
    # ASCII characters are a plain byte search.
    if c < Char(0x80)
        return rsearch(s, c%UInt8, i)
    end
    # Otherwise look for the first byte of the encoding and confirm the
    # whole character at each hit.
    lead = first_utf8_byte(c)
    while true
        i = rsearch(s, lead, i)
        if i == 0 || s[i] == c
            return i
        end
        i = prevind(s, i)
    end
end
|
||||
|
||||
# Reverse byte search via `memrchr`: index of the last byte equal to `b`
# among the first `i` bytes of `a`, or 0 when absent.
# `i == 0` and `i == sizeof(a) + 1` return 0; any other out-of-range `i`
# throws `BoundsError`.
# The default start is `sizeof(a)`. The previous default `s.len` referred to
# a variable `s` that is not a parameter of this method, so the two-argument
# form could not be called; `sizeof` is defined for both `String` and byte
# arrays.
function rsearch(a::Union{String,ByteArray}, b::Union{Int8,UInt8}, i::Integer = sizeof(a))
    if i < 1
        return i == 0 ? 0 : throw(BoundsError(a, i))
    end
    n = sizeof(a)
    if i > n
        return i == n+1 ? 0 : throw(BoundsError(a, i))
    end
    p = pointer(a)
    q = ccall(:memrchr, Ptr{UInt8}, (Ptr{UInt8}, Int32, Csize_t), p, b, i)
    return q == C_NULL ? 0 : Int(q-p+1)
end
|
||||
|
||||
# Reverse character search in a byte array: ASCII characters are searched as
# one byte, others as the byte sequence of `string(b)` (start of the match
# range).
function rsearch(a::ByteArray, b::Char, i::Integer = length(a))
    return isascii(b) ? rsearch(a, UInt8(b), i) :
                        rsearch(a, Vector{UInt8}(string(b)), i).start
end
|
||||
|
||||
## optimized concatenation, reverse, repeat ##
|
||||
|
||||
# Concatenate `String`s: a single argument is returned as is; otherwise the
# result is allocated at its final size and each piece is copied into place.
function string(a::String...)
    length(a) == 1 && return a[1]::String
    total = 0
    for piece in a
        total += piece.len
    end
    out = _string_n(total)
    dest = 1
    for piece in a
        nbytes = piece.len
        unsafe_copy!(pointer(out, dest), pointer(piece), nbytes)
        dest += nbytes
    end
    return out
end
|
||||
|
||||
# Number of bytes in the UTF-8 encoding of a character. Code points of
# 0x110000 and above are counted as 3 bytes, the length of '\ufffd'.
function codelen(d::Char)
    c = UInt32(d)
    c < 0x80 && return 1
    c < 0x800 && return 2
    c < 0x10000 && return 3
    c < 0x110000 && return 4
    return 3 # '\ufffd'
end
|
||||
|
||||
# Concatenate a mix of `String`s and `Char`s. A first pass adds up the byte
# size of the result; a second pass copies strings and UTF-8 encodes
# characters straight into the output. Code points of 0x110000 and above are
# written as '\ufffd'.
function string(a::Union{String,Char}...)
    nbytes = 0
    for piece in a
        nbytes += isa(piece, Char) ? codelen(piece::Char) : (piece::String).len
    end
    out = _string_n(nbytes)
    offs = 1
    p = pointer(out)
    for piece in a
        if isa(piece, String)
            len = (piece::String).len
            unsafe_copy!(pointer(out, offs), pointer(piece::String), len)
            offs += len
            continue
        end
        c = UInt32(piece::Char)
        if c < 0x80
            unsafe_store!(p, c%UInt8, offs)
            offs += 1
        elseif c < 0x800
            unsafe_store!(p, ((c >> 6) | 0xC0)%UInt8, offs)
            unsafe_store!(p, ((c & 0x3F) | 0x80)%UInt8, offs + 1)
            offs += 2
        elseif c < 0x10000
            unsafe_store!(p, ((c >> 12) | 0xE0)%UInt8, offs)
            unsafe_store!(p, (((c >> 6) & 0x3F) | 0x80)%UInt8, offs + 1)
            unsafe_store!(p, ((c & 0x3F) | 0x80)%UInt8, offs + 2)
            offs += 3
        elseif c < 0x110000
            unsafe_store!(p, ((c >> 18) | 0xF0)%UInt8, offs)
            unsafe_store!(p, (((c >> 12) & 0x3F) | 0x80)%UInt8, offs + 1)
            unsafe_store!(p, (((c >> 6) & 0x3F) | 0x80)%UInt8, offs + 2)
            unsafe_store!(p, ((c & 0x3F) | 0x80)%UInt8, offs + 3)
            offs += 4
        else
            # '\ufffd'
            unsafe_store!(p, 0xef, offs)
            unsafe_store!(p, 0xbf, offs + 1)
            unsafe_store!(p, 0xbd, offs + 2)
            offs += 3
        end
    end
    return out
end
|
||||
|
||||
# Reverse a `String` character by character. Strings of at most one byte are
# returned unchanged. The sequence width is taken from each first byte and
# the bytes of every character are copied, in order, towards the front of
# the output. Throws `UnicodeError(UTF_ERR_SHORT, ...)` when a sequence needs
# more room than is left.
function reverse(s::String)
    dat = convert(Vector{UInt8},s)
    n = length(dat)
    n <= 1 && return s
    buf = StringVector(n)
    out = n
    pos = 1
    @inbounds while out > 0
        ch = dat[pos]
        # Width implied by the first byte: >0xdf is 3 or 4 bytes, >0x7f is 2.
        width = ch > 0xdf ? (ch < 0xf0 ? 3 : 4) : (ch > 0x7f ? 2 : 1)
        out -= width
        out < 0 && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
        for k = 0:width-1
            buf[out + 1 + k] = dat[pos + k]
        end
        pos += width
    end
    return String(buf)
end
|
||||
|
||||
# Build a `String` consisting of `r` copies of `s`. A negative `r` throws
# `ArgumentError`.
function repeat(s::String, r::Integer)
    if r < 0
        throw(ArgumentError("can't repeat a string $r times"))
    end
    unit = s.len
    out = _string_n(unit*r)
    if unit != 1
        for k = 1:r
            unsafe_copy!(pointer(out, 1+(k-1)*unit), pointer(s), unit)
        end
    else
        # common case: repeating a single ASCII char
        ccall(:memset, Ptr{Void}, (Ptr{UInt8}, Cint, Csize_t), out, unsafe_load(pointer(s)), r)
    end
    return out
end
|
||||
10
julia-0.6.2/share/julia/base/strings/strings.jl
Normal file
10
julia-0.6.2/share/julia/base/strings/strings.jl
Normal file
@@ -0,0 +1,10 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
include("strings/errors.jl")
|
||||
include("strings/types.jl")
|
||||
include("strings/basic.jl")
|
||||
include("strings/search.jl")
|
||||
include("strings/util.jl")
|
||||
include("strings/io.jl")
|
||||
include("strings/utf8proc.jl")
|
||||
importall .UTF8proc
|
||||
157
julia-0.6.2/share/julia/base/strings/types.jl
Normal file
157
julia-0.6.2/share/julia/base/strings/types.jl
Normal file
@@ -0,0 +1,157 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
# SubString and RevString types
|
||||
|
||||
## substrings reference original strings ##
|
||||
|
||||
# View into a parent string: `offset` is added to substring indices to get
# parent indices, and `endof` is the last index within the substring.
struct SubString{T<:AbstractString} <: AbstractString
    string::T
    offset::Int
    endof::Int

    function SubString{T}(s::T, i::Int, j::Int) where T<:AbstractString
        # Start beyond the end, or an empty range: empty substring.
        (i > endof(s) || j < i) && return new(s, i-1, 0)
        isvalid(s, i) || throw(ArgumentError("invalid SubString index"))

        # Move the end back to a valid index, but not before the start.
        while !isvalid(s, j) && j > i
            j -= 1
        end

        offset = i - 1
        return new(s, offset, max(0, j - offset))
    end
end
|
||||
# Outer constructors.
function SubString(s::T, i::Int, j::Int) where {T<:AbstractString}
    return SubString{T}(s, i, j)
end

# A substring of a substring refers to the original parent, with the
# indices shifted by the existing offset.
function SubString(s::SubString, i::Int, j::Int)
    return SubString(s.string, s.offset + i, s.offset + j)
end

# Other integer index types are converted to `Int`.
function SubString(s::AbstractString, i::Integer, j::Integer)
    return SubString(s, Int(i), Int(j))
end

# With only a start index, the substring runs to `endof(s)`.
function SubString(s::AbstractString, i::Integer)
    return SubString(s, i, endof(s))
end
|
||||
|
||||
# 0 for an empty substring; otherwise one less than the index following `s.endof`.
sizeof(s::SubString{String}) = s.endof == 0 ? 0 : nextind(s, s.endof) - 1
|
||||
|
||||
# TODO: length(s::SubString) = ??
# default implementation will work but it's slow
# can this be delegated efficiently somehow?
# that may require additional string interfaces
function length(s::SubString{<:DirectIndexString})
    return endof(s)
end

# `String`-backed substrings: 0 when empty, otherwise `u8_charnum` over the
# `nextind(s, s.endof) - 1` bytes starting at `pointer(s)`.
function length(s::SubString{String})
    s.endof == 0 && return 0
    nbytes = nextind(s, s.endof) - 1
    return Int(ccall(:u8_charnum, Csize_t, (Ptr{UInt8}, Csize_t), pointer(s), nbytes))
end
|
||||
|
||||
# Iterate through the parent string, translating indices by `s.offset` in
# both directions. Throws `BoundsError` for `i` outside `1:s.endof`.
function next(s::SubString, i::Int)
    1 <= i <= s.endof || throw(BoundsError(s, i))
    ch, parent_next = next(s.string, i + s.offset)
    return ch, parent_next - s.offset
end
|
||||
|
||||
# Index into the parent string at `i + s.offset`. Throws `BoundsError` for
# `i` outside `1:s.endof`.
function getindex(s::SubString, i::Int)
    1 <= i <= s.endof || throw(BoundsError(s, i))
    return getindex(s.string, i + s.offset)
end
|
||||
|
||||
# The last index is stored in the `endof` field.
endof(s::SubString) = s.endof
|
||||
|
||||
# An index is valid when it lies within the substring and the corresponding
# parent index is valid.
function isvalid(s::SubString, i::Integer)
    start(s) <= i <= endof(s) || return false
    return isvalid(s.string, s.offset + i)
end

# For direct-indexed parents only the range check is performed.
function isvalid(s::SubString{<:DirectIndexString}, i::Integer)
    return start(s) <= i <= endof(s)
end

# For direct-indexed parents these return `i` unchanged after a bounds check.
function ind2chr(s::SubString{<:DirectIndexString}, i::Integer)
    checkbounds(s, i)
    return i
end

function chr2ind(s::SubString{<:DirectIndexString}, i::Integer)
    checkbounds(s, i)
    return i
end
|
||||
|
||||
# Index stepping is done on the parent string, translating by `s.offset`
# on the way in and on the way out.
function nextind(s::SubString, i::Integer)
    return nextind(s.string, i + s.offset) - s.offset
end

function prevind(s::SubString, i::Integer)
    return prevind(s.string, i + s.offset) - s.offset
end
|
||||
|
||||
# Converting a string to a SubString of its own type wraps indices 1 through `endof(s)`.
convert(::Type{SubString{T}}, s::T) where {T<:AbstractString} = SubString(s, 1, endof(s))
|
||||
|
||||
# Copy the bytes covered by the substring into a new `String`.
# The byte count comes from `sizeof(p)`, which is 0 for an empty substring.
# Calling `nextind(p, p.endof)-1` directly with `p.endof == 0` gives a
# non-zero count when the offset lands inside a multi-byte character, which
# copied stray continuation bytes instead of producing "".
String(p::SubString{String}) =
    unsafe_string(pointer(p.string, p.offset+1), sizeof(p))
|
||||
|
||||
# Range indexing on a generic string returns a `SubString` instead of a
# copy. Throws `BoundsError` when `checkbounds(s, r)` is false.
function getindex(s::AbstractString, r::UnitRange{Int})
    if !checkbounds(s, r)
        throw(BoundsError(s, r))
    end
    return SubString(s, first(r), last(r))
end
|
||||
|
||||
# Three-way comparison of `String`-backed substrings: `memcmp` over the
# common prefix, with ties broken by comparing the two byte sizes.
function cmp(a::SubString{String}, b::SubString{String})
    na = sizeof(a)
    nb = sizeof(b)
    c = ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt),
              pointer(a), pointer(b), min(na,nb))
    if c < 0
        return -1
    elseif c > 0
        return +1
    end
    return cmp(na, nb)
end
|
||||
|
||||
# don't make unnecessary copies when passing substrings to C functions:
# `cconvert` returns the substring itself, and `unsafe_convert` points into
# the parent's data at the substring's offset.
cconvert(::Type{Ptr{UInt8}}, s::SubString{String}) = s
cconvert(::Type{Ptr{Int8}}, s::SubString{String}) = s
unsafe_convert(::Type{Ptr{R}}, s::SubString{String}) where {R<:Union{Int8, UInt8}} =
    convert(Ptr{R}, pointer(s.string)) + s.offset
|
||||
|
||||
## reversed strings without data movement ##
|
||||
|
||||
# Wrapper that holds the original string; `next` on it iterates the
# characters back to front.
struct RevString{T<:AbstractString} <: AbstractString
    string::T
end

# Size queries are answered by the wrapped string.
function endof(s::RevString)
    return endof(s.string)
end

function length(s::RevString)
    return length(s.string)
end

function sizeof(s::RevString)
    return sizeof(s.string)
end
|
||||
|
||||
# Reversed index `i` maps to `endof(s) - i + 1` in the wrapped string; the
# next state is the mirrored index of the preceding character.
function next(s::RevString, i::Int)
    last_idx = endof(s)
    parent_idx = last_idx - i + 1
    ch = s.string[parent_idx]
    return (ch, last_idx - prevind(s.string, parent_idx) + 1)
end
|
||||
|
||||
"""
    reverse(s::AbstractString) -> AbstractString

Reverse a string.

```jldoctest
julia> reverse("JuliaLang")
"gnaLailuJ"
```
"""
function reverse(s::AbstractString)
    return RevString(s)
end

# Reversing a reversed string unwraps it.
function reverse(s::RevString)
    return s.string
end
|
||||
|
||||
## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)]

# Generic fallback: go through character positions.
reverseind(s::AbstractString, i) = chr2ind(s, length(s) + 1 - ind2chr(reverse(s), i))

# Direct-indexed strings and substrings of them: plain arithmetic.
# The substring member is written `SubString{<:DirectIndexString}`, as in the
# other methods of this file. Type parameters are invariant, so
# `SubString{DirectIndexString}` matched no substring of a concrete
# direct-indexed type and those fell back to the generic method.
reverseind(s::Union{DirectIndexString,SubString{<:DirectIndexString}}, i::Integer) = length(s) + 1 - i

reverseind(s::RevString, i::Integer) = endof(s) - i + 1

# `String`-backed substrings: translate to a parent index, reverse there,
# and translate back.
reverseind(s::SubString{String}, i::Integer) =
    reverseind(s.string, nextind(s.string, endof(s.string))-s.offset-s.endof+i-1) - s.offset
|
||||
|
||||
# Generic repeat: a negative count throws `ArgumentError`, 0 gives "",
# 1 returns `s` itself, and anything else is delegated to the `String`
# method after conversion.
function repeat(s::AbstractString, r::Integer)
    r < 0 && throw(ArgumentError("can't repeat a string $r times"))
    r == 0 && return ""
    r == 1 && return s
    return repeat(convert(String, s), r)
end
|
||||
|
||||
"""
    ^(s::AbstractString, n::Integer)

Repeat the string `s` `n` times.
The [`repeat`](@ref) function is an alias to this operator.

```jldoctest
julia> "Test "^3
"Test Test Test "
```
"""
function (^)(s::AbstractString, r::Integer)
    return repeat(s, r)
end
|
||||
|
||||
# Pointer to the substring's first byte: the parent's pointer plus the offset.
function pointer(x::SubString{String})
    return pointer(x.string) + x.offset
end

# Pointer to byte `i` of the substring.
function pointer(x::SubString{String}, i::Integer)
    return pointer(x.string) + x.offset + (i-1)
end
|
||||
398
julia-0.6.2/share/julia/base/strings/utf8proc.jl
Normal file
398
julia-0.6.2/share/julia/base/strings/utf8proc.jl
Normal file
@@ -0,0 +1,398 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
# Various Unicode functionality from the utf8proc library
|
||||
module UTF8proc
|
||||
|
||||
import Base: show, ==, hash, string, Symbol, isless, length, eltype, start, next, done, convert, isvalid, lowercase, uppercase, titlecase
|
||||
|
||||
export isgraphemebreak, category_code, category_abbrev, category_string
|
||||
|
||||
# also exported by Base:
|
||||
export normalize_string, graphemes, is_assigned_char, charwidth, isvalid,
|
||||
islower, isupper, isalpha, isdigit, isnumber, isalnum,
|
||||
iscntrl, ispunct, isspace, isprint, isgraph
|
||||
|
||||
# whether codepoints are valid Unicode scalar values, i.e. 0-0xd7ff, 0xe000-0x10ffff
|
||||
# A code point is rejected when `ch - 0xd800 < 0x800` (the surrogate block)
# or when it is greater than 0x10ffff.
function isvalid(::Type{Char}, ch::Unsigned)
    return !((ch - 0xd800 < 0x800) | (ch > 0x10ffff))
end

# Other integers are converted with `Unsigned`; characters with `UInt32`.
function isvalid(::Type{Char}, ch::Integer)
    return isvalid(Char, Unsigned(ch))
end

function isvalid(::Type{Char}, ch::Char)
    return isvalid(Char, UInt32(ch))
end

function isvalid(ch::Char)
    return isvalid(Char, ch)
end
|
||||
|
||||
# utf8 category constants
|
||||
# Numeric category codes 0-29; code `k` is described by `category_strings[k+1]`.
const UTF8PROC_CATEGORY_CN = 0
const UTF8PROC_CATEGORY_LU = 1
const UTF8PROC_CATEGORY_LL = 2
const UTF8PROC_CATEGORY_LT = 3
const UTF8PROC_CATEGORY_LM = 4
const UTF8PROC_CATEGORY_LO = 5
const UTF8PROC_CATEGORY_MN = 6
const UTF8PROC_CATEGORY_MC = 7
const UTF8PROC_CATEGORY_ME = 8
const UTF8PROC_CATEGORY_ND = 9
const UTF8PROC_CATEGORY_NL = 10
const UTF8PROC_CATEGORY_NO = 11
const UTF8PROC_CATEGORY_PC = 12
const UTF8PROC_CATEGORY_PD = 13
const UTF8PROC_CATEGORY_PS = 14
const UTF8PROC_CATEGORY_PE = 15
const UTF8PROC_CATEGORY_PI = 16
const UTF8PROC_CATEGORY_PF = 17
const UTF8PROC_CATEGORY_PO = 18
const UTF8PROC_CATEGORY_SM = 19
const UTF8PROC_CATEGORY_SC = 20
const UTF8PROC_CATEGORY_SK = 21
const UTF8PROC_CATEGORY_SO = 22
const UTF8PROC_CATEGORY_ZS = 23
const UTF8PROC_CATEGORY_ZL = 24
const UTF8PROC_CATEGORY_ZP = 25
const UTF8PROC_CATEGORY_CC = 26
const UTF8PROC_CATEGORY_CF = 27
const UTF8PROC_CATEGORY_CS = 28
const UTF8PROC_CATEGORY_CO = 29
|
||||
|
||||
# strings corresponding to the category constants
|
||||
# Descriptions for the category constants, in the same order as the
# numeric codes 0-29.
const category_strings = [
    "Other, not assigned",
    "Letter, uppercase",
    "Letter, lowercase",
    "Letter, titlecase",
    "Letter, modifier",
    "Letter, other",
    "Mark, nonspacing",
    "Mark, spacing combining",
    "Mark, enclosing",
    "Number, decimal digit",
    "Number, letter",
    "Number, other",
    "Punctuation, connector",
    "Punctuation, dash",
    "Punctuation, open",
    "Punctuation, close",
    "Punctuation, initial quote",
    "Punctuation, final quote",
    "Punctuation, other",
    "Symbol, math",
    "Symbol, currency",
    "Symbol, modifier",
    "Symbol, other",
    "Separator, space",
    "Separator, line",
    "Separator, paragraph",
    "Other, control",
    "Other, format",
    "Other, surrogate",
    "Other, private use"
]
|
||||
|
||||
# Option bit flags that are OR-ed together and passed to `utf8proc_map`.
const UTF8PROC_STABLE    = (1<<1)
const UTF8PROC_COMPAT    = (1<<2)
const UTF8PROC_COMPOSE   = (1<<3)
const UTF8PROC_DECOMPOSE = (1<<4)
const UTF8PROC_IGNORE    = (1<<5)
const UTF8PROC_REJECTNA  = (1<<6)
const UTF8PROC_NLF2LS    = (1<<7)
const UTF8PROC_NLF2PS    = (1<<8)
# NLF2LF is the combination of the two newline flags above.
const UTF8PROC_NLF2LF    = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
const UTF8PROC_STRIPCC   = (1<<9)
const UTF8PROC_CASEFOLD  = (1<<10)
const UTF8PROC_CHARBOUND = (1<<11)
const UTF8PROC_LUMP      = (1<<12)
const UTF8PROC_STRIPMARK = (1<<13)
|
||||
|
||||
############################################################################
|
||||
|
||||
# Raise an error whose message is the string `utf8proc_errmsg` returns for `result`.
utf8proc_error(result) = error(unsafe_string(ccall(:utf8proc_errmsg, Cstring, (Cssize_t,), result)))
|
||||
|
||||
# Apply the utf8proc transformation selected by `options` to `str` and
# return the result as a new `String`. A negative return from any of the
# library calls is reported through `utf8proc_error`.
function utf8proc_map(str::String, options::Integer)
    # First call with a NULL buffer of size 0; its result is used to size
    # the working buffer (4 bytes per reported word).
    needed = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
                   str, sizeof(str), C_NULL, 0, options)
    if needed < 0
        utf8proc_error(needed)
    end
    buffer = Base.StringVector(needed*4)
    # Second call fills the buffer.
    written = ccall(:utf8proc_decompose, Int, (Ptr{UInt8}, Int, Ptr{UInt8}, Int, Cint),
                    str, sizeof(str), buffer, needed, options)
    if written < 0
        utf8proc_error(written)
    end
    # Re-encode in place, then trim the buffer to the reported byte count.
    nbytes = ccall(:utf8proc_reencode, Int, (Ptr{UInt8}, Int, Cint), buffer, written, options)
    if nbytes < 0
        utf8proc_error(nbytes)
    end
    return String(resize!(buffer, nbytes))
end
|
||||
|
||||
# Other string types are converted to `String` first.
utf8proc_map(s::AbstractString, flags::Integer) = utf8proc_map(String(s), flags)
|
||||
|
||||
# Keyword form of `normalize_string`: each boolean keyword switches on the
# matching UTF8PROC_* option bit, and the combined flags go to `utf8proc_map`.
function normalize_string(s::AbstractString; stable::Bool=false, compat::Bool=false,
                          compose::Bool=true, decompose::Bool=false,
                          stripignore::Bool=false, rejectna::Bool=false,
                          newline2ls::Bool=false, newline2ps::Bool=false,
                          newline2lf::Bool=false, stripcc::Bool=false,
                          casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
    flags = 0
    if stable
        flags |= UTF8PROC_STABLE
    end
    if compat
        flags |= UTF8PROC_COMPAT
    end
    # `decompose` takes priority over `compose`; with neither set, the options
    # that depend on (de)composition are rejected.
    if decompose
        flags |= UTF8PROC_DECOMPOSE
    elseif compose
        flags |= UTF8PROC_COMPOSE
    elseif compat || stripmark
        throw(ArgumentError("compat=true or stripmark=true require compose=true or decompose=true"))
    end
    if stripignore
        flags |= UTF8PROC_IGNORE
    end
    if rejectna
        flags |= UTF8PROC_REJECTNA
    end
    if newline2ls + newline2ps + newline2lf > 1
        throw(ArgumentError("only one newline conversion may be specified"))
    end
    if newline2ls
        flags |= UTF8PROC_NLF2LS
    end
    if newline2ps
        flags |= UTF8PROC_NLF2PS
    end
    if newline2lf
        flags |= UTF8PROC_NLF2LF
    end
    if stripcc
        flags |= UTF8PROC_STRIPCC
    end
    if casefold
        flags |= UTF8PROC_CASEFOLD
    end
    if lump
        flags |= UTF8PROC_LUMP
    end
    if stripmark
        flags |= UTF8PROC_STRIPMARK
    end
    return utf8proc_map(s, flags)
end
|
||||
|
||||
"""
|
||||
normalize_string(s::AbstractString, normalform::Symbol)
|
||||
|
||||
Normalize the string `s` according to one of the four "normal forms" of the Unicode
|
||||
standard: `normalform` can be `:NFC`, `:NFD`, `:NFKC`, or `:NFKD`. Normal forms C
|
||||
(canonical composition) and D (canonical decomposition) convert different visually identical
|
||||
representations of the same abstract string into a single canonical form, with form C being
|
||||
more compact. Normal forms KC and KD additionally canonicalize "compatibility equivalents":
|
||||
they convert characters that are abstractly similar but visually distinct into a single
|
||||
canonical choice (e.g. they expand ligatures into the individual characters), with form KC
|
||||
being more compact.
|
||||
|
||||
Alternatively, finer control and additional transformations may be be obtained by calling
|
||||
`normalize_string(s; keywords...)`, where any number of the following boolean keywords
|
||||
options (which all default to `false` except for `compose`) are specified:
|
||||
|
||||
* `compose=false`: do not perform canonical composition
|
||||
* `decompose=true`: do canonical decomposition instead of canonical composition
|
||||
(`compose=true` is ignored if present)
|
||||
* `compat=true`: compatibility equivalents are canonicalized
|
||||
* `casefold=true`: perform Unicode case folding, e.g. for case-insensitive string comparison
|
||||
* `newline2lf=true`, `newline2ls=true`, or `newline2ps=true`: convert various newline
|
||||
sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or
|
||||
paragraph-separation (PS) character, respectively
|
||||
* `stripmark=true`: strip diacritical marks (e.g. accents)
|
||||
* `stripignore=true`: strip Unicode's "default ignorable" characters (e.g. the soft hyphen
|
||||
or the left-to-right marker)
|
||||
* `stripcc=true`: strip control characters; horizontal tabs and form feeds are converted to
|
||||
spaces; newlines are also converted to spaces unless a newline-conversion flag was
|
||||
specified
|
||||
* `rejectna=true`: throw an error if unassigned code points are found
|
||||
* `stable=true`: enforce Unicode Versioning Stability
|
||||
|
||||
For example, NFKC corresponds to the options `compose=true, compat=true, stable=true`.
|
||||
"""
|
||||
function normalize_string(s::AbstractString, nf::Symbol)
|
||||
utf8proc_map(s, nf == :NFC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE) :
|
||||
nf == :NFD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE) :
|
||||
nf == :NFKC ? (UTF8PROC_STABLE | UTF8PROC_COMPOSE
|
||||
| UTF8PROC_COMPAT) :
|
||||
nf == :NFKD ? (UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
|
||||
| UTF8PROC_COMPAT) :
|
||||
throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD")))
|
||||
end
|
||||
|
||||
############################################################################
|
||||
|
||||
"""
|
||||
charwidth(c)
|
||||
|
||||
Gives the number of columns needed to print a character.
|
||||
"""
|
||||
charwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))
|
||||
|
||||
# Single-character case conversion.  ASCII letters are shifted by 0x20 directly
# and other ASCII characters are returned unchanged; non-ASCII characters are
# delegated to the corresponding utf8proc routine.
function lowercase(c::Char)
    if isascii(c)
        return 'A' <= c <= 'Z' ? c + 0x20 : c
    end
    return Char(ccall(:utf8proc_tolower, UInt32, (UInt32,), c))
end

function uppercase(c::Char)
    if isascii(c)
        return 'a' <= c <= 'z' ? c - 0x20 : c
    end
    return Char(ccall(:utf8proc_toupper, UInt32, (UInt32,), c))
end

function titlecase(c::Char)
    if isascii(c)
        return 'a' <= c <= 'z' ? c - 0x20 : c
    end
    return Char(ccall(:utf8proc_totitle, UInt32, (UInt32,), c))
end
|
||||
|
||||
############################################################################
|
||||
|
||||
# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
function category_code(c)
    return ccall(:utf8proc_category, Cint, (UInt32,), c)
end

# more human-readable representations of the category code
function category_abbrev(c)
    abbrev = ccall(:utf8proc_category_string, Cstring, (UInt32,), c)
    return unsafe_string(abbrev)
end

# `category_strings` is 1-based while category codes start at 0, hence the +1.
function category_string(c)
    return category_strings[category_code(c) + 1]
end

"""
    is_assigned_char(c) -> Bool

Returns `true` if the given char or integer is an assigned Unicode code point.
"""
function is_assigned_char(c)
    return category_code(c) != UTF8PROC_CATEGORY_CN
end
|
||||
|
||||
## libc character class predicates ##
|
||||
|
||||
"""
|
||||
islower(c::Char) -> Bool
|
||||
|
||||
Tests whether a character is a lowercase letter.
|
||||
A character is classified as lowercase if it belongs to Unicode category Ll,
|
||||
Letter: Lowercase.
|
||||
"""
|
||||
islower(c::Char) = (category_code(c) == UTF8PROC_CATEGORY_LL)
|
||||
|
||||
# true for Unicode upper and mixed case
|
||||
|
||||
"""
|
||||
isupper(c::Char) -> Bool
|
||||
|
||||
Tests whether a character is an uppercase letter.
|
||||
A character is classified as uppercase if it belongs to Unicode category Lu,
|
||||
Letter: Uppercase, or Lt, Letter: Titlecase.
|
||||
"""
|
||||
function isupper(c::Char)
|
||||
ccode = category_code(c)
|
||||
return ccode == UTF8PROC_CATEGORY_LU || ccode == UTF8PROC_CATEGORY_LT
|
||||
end
|
||||
|
||||
"""
|
||||
isdigit(c::Char) -> Bool
|
||||
|
||||
Tests whether a character is a numeric digit (0-9).
|
||||
"""
|
||||
isdigit(c::Char) = ('0' <= c <= '9')
|
||||
|
||||
"""
|
||||
isalpha(c::Char) -> Bool
|
||||
|
||||
Tests whether a character is alphabetic.
|
||||
A character is classified as alphabetic if it belongs to the Unicode general
|
||||
category Letter, i.e. a character whose category code begins with 'L'.
|
||||
"""
|
||||
isalpha(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_LO)
|
||||
|
||||
"""
|
||||
isnumber(c::Char) -> Bool
|
||||
|
||||
Tests whether a character is numeric.
|
||||
A character is classified as numeric if it belongs to the Unicode general category Number,
|
||||
i.e. a character whose category code begins with 'N'.
|
||||
"""
|
||||
isnumber(c::Char) = (UTF8PROC_CATEGORY_ND <= category_code(c) <= UTF8PROC_CATEGORY_NO)
|
||||
|
||||
"""
|
||||
isalnum(c::Char) -> Bool
|
||||
|
||||
Tests whether a character is alphanumeric.
|
||||
A character is classified as alphabetic if it belongs to the Unicode general
|
||||
category Letter or Number, i.e. a character whose category code begins with 'L' or 'N'.
|
||||
"""
|
||||
function isalnum(c::Char)
|
||||
ccode = category_code(c)
|
||||
return (UTF8PROC_CATEGORY_LU <= ccode <= UTF8PROC_CATEGORY_LO) ||
|
||||
(UTF8PROC_CATEGORY_ND <= ccode <= UTF8PROC_CATEGORY_NO)
|
||||
end
|
||||
|
||||
# following C++ only control characters from the Latin-1 subset return true

"""
    iscntrl(c::Char) -> Bool

Tests whether a character is a control character.
Control characters are the non-printing characters of the Latin-1 subset of Unicode.
"""
function iscntrl(c::Char)
    # U+0000..U+001F, then U+007F..U+009F
    c <= Char(0x1f) && return true
    return Char(0x7f) <= c <= Char(0x9f)
end
|
||||
|
||||
"""
|
||||
ispunct(c::Char) -> Bool
|
||||
|
||||
Tests whether a character belongs to the Unicode general category Punctuation, i.e. a
|
||||
character whose category code begins with 'P'.
|
||||
"""
|
||||
ispunct(c::Char) = (UTF8PROC_CATEGORY_PC <= category_code(c) <= UTF8PROC_CATEGORY_PO)
|
||||
|
||||
# \u85 is the Unicode Next Line (NEL) character

"""
    isspace(c::Char) -> Bool

Tests whether a character is any whitespace character. Includes ASCII characters '\\t',
'\\n', '\\v', '\\f', '\\r', and ' ', Latin-1 character U+0085, and characters in Unicode
category Zs.
"""
@inline function isspace(c::Char)
    # Cheap comparisons first; the category lookup is only reached for
    # characters at or above U+00A0.
    (c == ' ' || '\t' <= c <= '\r' || c == '\u85') && return true
    return '\ua0' <= c && category_code(c) == UTF8PROC_CATEGORY_ZS
end
|
||||
|
||||
"""
|
||||
isprint(c::Char) -> Bool
|
||||
|
||||
Tests whether a character is printable, including spaces, but not a control character.
|
||||
"""
|
||||
isprint(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_ZS)
|
||||
|
||||
# true in principal if a printer would use ink
|
||||
|
||||
"""
|
||||
isgraph(c::Char) -> Bool
|
||||
|
||||
Tests whether a character is printable, and not a space.
|
||||
Any character that would cause a printer to use ink should be
|
||||
classified with `isgraph(c)==true`.
|
||||
"""
|
||||
isgraph(c::Char) = (UTF8PROC_CATEGORY_LU <= category_code(c) <= UTF8PROC_CATEGORY_SO)
|
||||
|
||||
############################################################################
|
||||
# iterators for grapheme segmentation
|
||||
|
||||
# Ask utf8proc whether a grapheme break is permitted between `c1` and `c2`.
function isgraphemebreak(c1::Char, c2::Char)
    return ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)
end

# Stateful grapheme break required by Unicode-9 rules: the string
# must be processed in sequence, with state initialized to Ref{Int32}(0).
# Requires utf8proc v2.0 or later.
function isgraphemebreak!(state::Ref{Int32}, c1::Char, c2::Char)
    return ccall(:utf8proc_grapheme_break_stateful, Bool,
                 (UInt32, UInt32, Ref{Int32}), c1, c2, state)
end
|
||||
|
||||
# Thin wrapper around a string; iterating it yields grapheme substrings.
struct GraphemeIterator{S<:AbstractString}
    s::S # original string (for generation of SubStrings)
end

"""
    graphemes(s::AbstractString) -> GraphemeIterator

Returns an iterator over substrings of `s` that correspond to the extended graphemes in the
string, as defined by Unicode UAX #29. (Roughly, these are what users would perceive as
single characters, even though they may contain more than one codepoint; for example a
letter combined with an accent mark is a single grapheme.)
"""
function graphemes(s::AbstractString)
    return GraphemeIterator{typeof(s)}(s)
end

# Elements are substrings of the wrapped string type; wrapping a `SubString{S}`
# also reports `SubString{S}` rather than a nested substring type.
function eltype(::Type{GraphemeIterator{S}}) where {S}
    return SubString{S}
end
function eltype(::Type{GraphemeIterator{SubString{S}}}) where {S}
    return SubString{S}
end
|
||||
|
||||
# Count graphemes by counting the grapheme breaks seen while scanning the string.
function length(g::GraphemeIterator)
    prev = Char(0x00ad) # soft hyphen (grapheme break always allowed after this)
    count = 0
    state = Ref{Int32}(0)
    for cur in g.s
        if isgraphemebreak!(state, prev, cur)
            count += 1
        end
        prev = cur
    end
    return count
end
|
||||
|
||||
# Iteration state is a tuple: (index into the wrapped string, utf8proc break state).
function start(g::GraphemeIterator)
    return (start(g.s), Ref{Int32}(0))
end

function done(g::GraphemeIterator, i)
    return done(g.s, i[1])
end

# Return the grapheme starting at the current index together with the state
# positioned at the start of the following grapheme.
function next(g::GraphemeIterator, i_)
    str = g.s
    first_idx, state = i_
    last_idx = first_idx
    prev, nxt = next(str, first_idx)
    while !done(str, nxt) # loop until next grapheme is str[first_idx:last_idx]
        cur, after = next(str, nxt)
        if isgraphemebreak!(state, prev, cur)
            break
        end
        last_idx = nxt
        nxt = after
        prev = cur
    end
    return (SubString(str, first_idx, last_idx), (nxt, state))
end
|
||||
|
||||
# Comparison, hashing and ordering all defer to the wrapped string.
function ==(g1::GraphemeIterator, g2::GraphemeIterator)
    return g1.s == g2.s
end

function hash(g::GraphemeIterator, h::UInt)
    return hash(g.s, h)
end

function isless(g1::GraphemeIterator, g2::GraphemeIterator)
    return isless(g1.s, g2.s)
end

# Converting to a string type converts the wrapped string.
function convert(::Type{S}, g::GraphemeIterator) where {S<:AbstractString}
    return convert(S, g.s)
end

# One-line summary: grapheme count, wrapped string type, and the string itself.
function show(io::IO, g::GraphemeIterator{S}) where {S}
    return print(io, "length-$(length(g)) GraphemeIterator{$S} for \"$(g.s)\"")
end
|
||||
|
||||
############################################################################
|
||||
|
||||
end # module
|
||||
500
julia-0.6.2/share/julia/base/strings/util.jl
Normal file
500
julia-0.6.2/share/julia/base/strings/util.jl
Normal file
@@ -0,0 +1,500 @@
|
||||
# This file is a part of Julia. License is MIT: https://julialang.org/license
|
||||
|
||||
# starts with and ends with predicates
|
||||
|
||||
"""
|
||||
startswith(s::AbstractString, prefix::AbstractString)
|
||||
|
||||
Returns `true` if `s` starts with `prefix`. If `prefix` is a vector or set
|
||||
of characters, tests whether the first character of `s` belongs to that set.
|
||||
|
||||
See also [`endswith`](@ref).
|
||||
|
||||
```jldoctest
|
||||
julia> startswith("JuliaLang", "Julia")
|
||||
true
|
||||
```
|
||||
"""
|
||||
function startswith(a::AbstractString, b::AbstractString)
|
||||
i = start(a)
|
||||
j = start(b)
|
||||
while !done(a,i) && !done(b,i)
|
||||
c, i = next(a,i)
|
||||
d, j = next(b,j)
|
||||
(c != d) && (return false)
|
||||
end
|
||||
done(b,i)
|
||||
end
|
||||
startswith(str::AbstractString, chars::Chars) = !isempty(str) && first(str) in chars
|
||||
|
||||
"""
|
||||
endswith(s::AbstractString, suffix::AbstractString)
|
||||
|
||||
Returns `true` if `s` ends with `suffix`. If `suffix` is a vector or set of
|
||||
characters, tests whether the last character of `s` belongs to that set.
|
||||
|
||||
See also [`startswith`](@ref).
|
||||
|
||||
```jldoctest
|
||||
julia> endswith("Sunday", "day")
|
||||
true
|
||||
```
|
||||
"""
|
||||
function endswith(a::AbstractString, b::AbstractString)
|
||||
i = endof(a)
|
||||
j = endof(b)
|
||||
a1 = start(a)
|
||||
b1 = start(b)
|
||||
while a1 <= i && b1 <= j
|
||||
c = a[i]
|
||||
d = b[j]
|
||||
(c != d) && (return false)
|
||||
i = prevind(a,i)
|
||||
j = prevind(b,j)
|
||||
end
|
||||
j < b1
|
||||
end
|
||||
endswith(str::AbstractString, chars::Chars) = !isempty(str) && last(str) in chars
|
||||
|
||||
# Fast paths: compare the leading bytes with a single `memcmp` when `a` is at
# least as long as `b`.
function startswith(a::String, b::String)
    a.len >= b.len || return false
    return ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, b.len) == 0
end

function startswith(a::Vector{UInt8}, b::Vector{UInt8})
    length(a) >= length(b) || return false
    return ccall(:memcmp, Int32, (Ptr{UInt8}, Ptr{UInt8}, UInt), a, b, length(b)) == 0
end
|
||||
|
||||
# TODO: fast endswith
|
||||
|
||||
"""
|
||||
chop(s::AbstractString)
|
||||
|
||||
Remove the last character from `s`.
|
||||
|
||||
```jldoctest
|
||||
julia> a = "March"
|
||||
"March"
|
||||
|
||||
julia> chop(a)
|
||||
"Marc"
|
||||
```
|
||||
"""
|
||||
chop(s::AbstractString) = SubString(s, 1, endof(s)-1)
|
||||
|
||||
"""
|
||||
chomp(s::AbstractString)
|
||||
|
||||
Remove a single trailing newline from a string.
|
||||
|
||||
```jldoctest
|
||||
julia> chomp("Hello\\n")
|
||||
"Hello"
|
||||
```
|
||||
"""
|
||||
function chomp(s::AbstractString)
|
||||
i = endof(s)
|
||||
(i < 1 || s[i] != '\n') && (return SubString(s, 1, i))
|
||||
j = prevind(s,i)
|
||||
(j < 1 || s[j] != '\r') && (return SubString(s, 1, i-1))
|
||||
return SubString(s, 1, j-1)
|
||||
end
|
||||
function chomp(s::String)
|
||||
i = endof(s)
|
||||
if i < 1 || codeunit(s,i) != 0x0a
|
||||
SubString(s, 1, i)
|
||||
elseif i < 2 || codeunit(s,i-1) != 0x0d
|
||||
SubString(s, 1, i-1)
|
||||
else
|
||||
SubString(s, 1, i-2)
|
||||
end
|
||||
end
|
||||
|
||||
# NOTE: use with caution -- breaks the immutable string convention!
# TODO: this is hard to provide with the new representation
#function chomp!(s::String)
#    if !isempty(s) && codeunit(s,s.len) == 0x0a
#        n = (endof(s) < 2 || s.data[end-1] != 0x0d) ? 1 : 2
#        ccall(:jl_array_del_end, Void, (Any, UInt), s.data, n)
#    end
#    return s
#end

# copying fallback for other string types
function chomp!(s::AbstractString)
    return chomp(s)
end
|
||||
|
||||
# Characters stripped or split on by default: space, tab, newline, vertical tab,
# form feed and carriage return.
const _default_delims = Char[' ', '\t', '\n', '\v', '\f', '\r']
|
||||
|
||||
"""
|
||||
lstrip(s::AbstractString[, chars::Chars])
|
||||
|
||||
Return `s` with any leading whitespace and delimiters removed.
|
||||
The default delimiters to remove are `' '`, `\\t`, `\\n`, `\\v`,
|
||||
`\\f`, and `\\r`.
|
||||
If `chars` (a character, or vector or set of characters) is provided,
|
||||
instead remove characters contained in it.
|
||||
|
||||
```jldoctest
|
||||
julia> a = lpad("March", 20)
|
||||
" March"
|
||||
|
||||
julia> lstrip(a)
|
||||
"March"
|
||||
```
|
||||
"""
|
||||
function lstrip(s::AbstractString, chars::Chars=_default_delims)
|
||||
i = start(s)
|
||||
while !done(s,i)
|
||||
c, j = next(s,i)
|
||||
if !(c in chars)
|
||||
return s[i:end]
|
||||
end
|
||||
i = j
|
||||
end
|
||||
s[end+1:end]
|
||||
end
|
||||
|
||||
"""
|
||||
rstrip(s::AbstractString[, chars::Chars])
|
||||
|
||||
Return `s` with any trailing whitespace and delimiters removed.
|
||||
The default delimiters to remove are `' '`, `\\t`, `\\n`, `\\v`,
|
||||
`\\f`, and `\\r`.
|
||||
If `chars` (a character, or vector or set of characters) is provided,
|
||||
instead remove characters contained in it.
|
||||
|
||||
```jldoctest
|
||||
julia> a = rpad("March", 20)
|
||||
"March "
|
||||
|
||||
julia> rstrip(a)
|
||||
"March"
|
||||
```
|
||||
"""
|
||||
function rstrip(s::AbstractString, chars::Chars=_default_delims)
|
||||
r = RevString(s)
|
||||
i = start(r)
|
||||
while !done(r,i)
|
||||
c, j = next(r,i)
|
||||
if !(c in chars)
|
||||
return s[1:end-i+1]
|
||||
end
|
||||
i = j
|
||||
end
|
||||
s[1:0]
|
||||
end
|
||||
|
||||
"""
|
||||
strip(s::AbstractString, [chars::Chars])
|
||||
|
||||
Return `s` with any leading and trailing whitespace removed.
|
||||
If `chars` (a character, or vector or set of characters) is provided,
|
||||
instead remove characters contained in it.
|
||||
|
||||
```jldoctest
|
||||
julia> strip("{3, 5}\\n", ['{', '}', '\\n'])
|
||||
"3, 5"
|
||||
```
|
||||
"""
|
||||
strip(s::AbstractString) = lstrip(rstrip(s))
|
||||
strip(s::AbstractString, chars::Chars) = lstrip(rstrip(s, chars), chars)
|
||||
|
||||
## string padding functions ##
|
||||
|
||||
# Pad `s` on the left with copies of `p` until it is `n` columns wide (widths
# measured with `strwidth`).  `s` is returned unchanged when already wide enough.
function lpad(s::AbstractString, n::Integer, p::AbstractString=" ")
    missing_cols = n - strwidth(s)
    missing_cols <= 0 && return s
    pad_width = strwidth(p)
    pad_width == 1 && return string(p^missing_cols, s)
    # Whole repetitions of `p`, then a leading slice of `p` for the remainder.
    full = div(missing_cols, pad_width)
    rest = missing_cols - full*pad_width
    stop = rest != 0 ? chr2ind(p, rest) : -1
    return string(p^full, p[1:stop], s)
end

# Pad `s` on the right with copies of `p` until it is `n` columns wide (widths
# measured with `strwidth`).  `s` is returned unchanged when already wide enough.
function rpad(s::AbstractString, n::Integer, p::AbstractString=" ")
    missing_cols = n - strwidth(s)
    missing_cols <= 0 && return s
    pad_width = strwidth(p)
    pad_width == 1 && return string(s, p^missing_cols)
    # Whole repetitions of `p`, then a leading slice of `p` for the remainder.
    full = div(missing_cols, pad_width)
    rest = missing_cols - full*pad_width
    stop = rest != 0 ? chr2ind(p, rest) : -1
    return string(s, p^full, p[1:stop])
end
|
||||
|
||||
"""
|
||||
lpad(s, n::Integer, p::AbstractString=" ")
|
||||
|
||||
Make a string at least `n` columns wide when printed by padding `s` on the left
|
||||
with copies of `p`.
|
||||
|
||||
```jldoctest
|
||||
julia> lpad("March",10)
|
||||
" March"
|
||||
```
|
||||
"""
|
||||
lpad(s, n::Integer, p=" ") = lpad(string(s),n,string(p))
|
||||
|
||||
"""
|
||||
rpad(s, n::Integer, p::AbstractString=" ")
|
||||
|
||||
Make a string at least `n` columns wide when printed by padding `s` on the right
|
||||
with copies of `p`.
|
||||
|
||||
```jldoctest
|
||||
julia> rpad("March",20)
|
||||
"March "
|
||||
```
|
||||
"""
|
||||
rpad(s, n::Integer, p=" ") = rpad(string(s),n,string(p))
|
||||
cpad(s, n::Integer, p=" ") = rpad(lpad(s,div(n+strwidth(s),2),p),n,p)
|
||||
|
||||
# splitter can be a Char, Vector{Char}, AbstractString, Regex, ...
# any splitter that provides search(s::AbstractString, splitter)

# Splitting a `SubString` yields pieces of that same `SubString` type.
function split(str::T, splitter; limit::Integer=0, keep::Bool=true) where {T<:SubString}
    return _split(str, splitter, limit, keep, T[])
end

"""
    split(s::AbstractString, [chars]; limit::Integer=0, keep::Bool=true)

Return an array of substrings by splitting the given string on occurrences of the given
character delimiters, which may be specified in any of the formats allowed by `search`'s
second argument (i.e. a single character, collection of characters, string, or regular
expression). If `chars` is omitted, it defaults to the set of all space characters, and
`keep` is taken to be `false`. The two keyword arguments are optional: they are a
maximum size for the result and a flag determining whether empty fields should be kept in
the result.

```jldoctest
julia> a = "Ma.rch"
"Ma.rch"

julia> split(a,".")
2-element Array{SubString{String},1}:
 "Ma"
 "rch"
```
"""
function split(str::T, splitter; limit::Integer=0, keep::Bool=true) where {T<:AbstractString}
    return _split(str, splitter, limit, keep, SubString{T}[])
end
|
||||
# Worker behind `split`: scans `str` left to right with `search`, pushing each
# field onto `strs` (which is also the return value).  Splitting stops once
# `strs` holds `limit-1` fields; the rest of the string becomes the last field.
function _split(str::AbstractString, splitter, limit::Integer, keep_empty::Bool, strs::Array)
    field_start = start(str)
    last_idx = endof(str)
    hit = search(str, splitter, field_start)
    hit_start, after_hit = first(hit), nextind(str, last(hit))
    while 0 < hit_start <= last_idx && length(strs) != limit-1
        if field_start < after_hit
            if keep_empty || field_start < hit_start
                push!(strs, SubString(str, field_start, prevind(str, hit_start)))
            end
            field_start = after_hit
        end
        # Always resume the search past the start of the current match.
        if after_hit <= hit_start
            after_hit = nextind(str, hit_start)
        end
        hit = search(str, splitter, after_hit)
        hit_start, after_hit = first(hit), nextind(str, last(hit))
    end
    if keep_empty || !done(str, field_start)
        push!(strs, SubString(str, field_start))
    end
    return strs
end

# a bit oddball, but standard behavior in Perl, Ruby & Python:
function split(str::AbstractString)
    return split(str, _default_delims; limit=0, keep=false)
end
|
||||
|
||||
# Splitting a `SubString` yields pieces of that same `SubString` type.
function rsplit(str::T, splitter; limit::Integer=0, keep::Bool=true) where {T<:SubString}
    return _rsplit(str, splitter, limit, keep, T[])
end

"""
    rsplit(s::AbstractString, [chars]; limit::Integer=0, keep::Bool=true)

Similar to [`split`](@ref), but starting from the end of the string.

```jldoctest
julia> a = "M.a.r.c.h"
"M.a.r.c.h"

julia> rsplit(a,".")
5-element Array{SubString{String},1}:
 "M"
 "a"
 "r"
 "c"
 "h"

julia> rsplit(a,".";limit=1)
1-element Array{SubString{String},1}:
 "M.a.r.c.h"

julia> rsplit(a,".";limit=2)
2-element Array{SubString{String},1}:
 "M.a.r.c"
 "h"
```
"""
function rsplit(str::T, splitter; limit::Integer=0, keep::Bool=true) where {T<:AbstractString}
    return _rsplit(str, splitter, limit, keep, SubString{T}[])
end
|
||||
# Worker behind `rsplit`: scans `str` right to left with `rsearch`, prepending
# each field to `strs` (which is also the return value).  Splitting stops once
# `strs` holds `limit-1` fields; the rest of the string becomes the first field.
function _rsplit(str::AbstractString, splitter, limit::Integer, keep_empty::Bool, strs::Array)
    first_idx = start(str)
    field_end = endof(str)
    hit = rsearch(str, splitter)
    before_hit = first(hit) - 1
    hit_end = last(hit)
    while (0 <= before_hit < field_end) && (length(strs) != limit-1)
        if first_idx <= hit_end
            if keep_empty || hit_end < field_end
                unshift!(strs, SubString(str, hit_end+1, field_end))
            end
            field_end = before_hit
        end
        # Always resume the search before the end of the current match.
        if hit_end <= before_hit
            before_hit = prevind(str, before_hit)
        end
        hit = rsearch(str, splitter, before_hit)
        before_hit = first(hit) - 1
        hit_end = last(hit)
    end
    if keep_empty || field_end > 0
        unshift!(strs, SubString(str, 1, field_end))
    end
    return strs
end
#rsplit(str::AbstractString) = rsplit(str, _default_delims, 0, false)
|
||||
|
||||
# Emit the replacement for one match.  A plain replacement is printed as-is.
function _replace(io, repl, str, r, pattern)
    return print(io, repl)
end

# A function replacement is applied to the matched substring `str[first(r):last(r)]`
# and its result is printed.
function _replace(io, repl::Function, str, r, pattern)
    matched = SubString(str, first(r), last(r))
    return print(io, repl(matched))
end
|
||||
|
||||
# Core of `replace` for `String`: copy `str` into a buffer, substituting each
# match of `pattern` found by `search` via `_replace`.  At most `limit` matches
# are replaced; the counter starts at 1, so `limit == 0` never stops the loop.
function replace(str::String, pattern, repl, limit::Integer)
    nmatch = 1
    last_idx = endof(str)
    pos = first_pos = start(str)
    hit = search(str, pattern, pos)
    hit_start, hit_end = first(hit), last(hit)
    # Output buffer pre-sized to 1.2x the input, then reset to empty.
    out = IOBuffer(StringVector(floor(Int, 1.2sizeof(str))), true, true)
    out.size = 0
    out.ptr = 1
    while hit_start != 0
        if pos == first_pos || pos <= hit_end
            # Copy the unmatched text before the match, then the replacement.
            unsafe_write(out, pointer(str, pos), UInt(hit_start - pos))
            _replace(out, repl, str, hit, pattern)
        end
        if hit_end < hit_start
            # Empty match: keep the character at `hit_start`, search after it.
            pos = hit_start
            hit_end = nextind(str, hit_start)
        else
            pos = hit_end = nextind(str, hit_end)
        end
        hit_start > last_idx && break
        hit = search(str, pattern, hit_end)
        hit_start, hit_end = first(hit), last(hit)
        nmatch == limit && break
        nmatch += 1
    end
    # Copy whatever follows the last replacement.
    write(out, SubString(str, pos))
    return String(take!(out))
end
|
||||
|
||||
"""
|
||||
replace(string::AbstractString, pat, r[, n::Integer=0])
|
||||
|
||||
Search for the given pattern `pat`, and replace each occurrence with `r`. If `n` is
|
||||
provided, replace at most `n` occurrences. As with search, the second argument may be a
|
||||
single character, a vector or a set of characters, a string, or a regular expression. If `r`
|
||||
is a function, each occurrence is replaced with `r(s)` where `s` is the matched substring.
|
||||
If `pat` is a regular expression and `r` is a `SubstitutionString`, then capture group
|
||||
references in `r` are replaced with the corresponding matched text.
|
||||
"""
|
||||
replace(s::AbstractString, pat, f, n::Integer) = replace(String(s), pat, f, n)
|
||||
replace(s::AbstractString, pat, r) = replace(s, pat, r, 0)
|
||||
|
||||
# hex <-> bytes conversion
|
||||
|
||||
"""
|
||||
hex2bytes(s::AbstractString)
|
||||
|
||||
Convert an arbitrarily long hexadecimal string to its binary representation. Returns an
|
||||
`Array{UInt8,1}`, i.e. an array of bytes.
|
||||
|
||||
```jldoctest
|
||||
julia> a = hex(12345)
|
||||
"3039"
|
||||
|
||||
julia> hex2bytes(a)
|
||||
2-element Array{UInt8,1}:
|
||||
0x30
|
||||
0x39
|
||||
```
|
||||
"""
|
||||
function hex2bytes(s::AbstractString)
|
||||
a = zeros(UInt8, div(endof(s), 2))
|
||||
i, j = start(s), 0
|
||||
while !done(s, i)
|
||||
c, i = next(s, i)
|
||||
n = '0' <= c <= '9' ? c - '0' :
|
||||
'a' <= c <= 'f' ? c - 'a' + 10 :
|
||||
'A' <= c <= 'F' ? c - 'A' + 10 :
|
||||
throw(ArgumentError("not a hexadecimal string: $(repr(s))"))
|
||||
done(s, i) &&
|
||||
throw(ArgumentError("string length must be even: length($(repr(s))) == $(length(s))"))
|
||||
c, i = next(s, i)
|
||||
n = '0' <= c <= '9' ? n << 4 + c - '0' :
|
||||
'a' <= c <= 'f' ? n << 4 + c - 'a' + 10 :
|
||||
'A' <= c <= 'F' ? n << 4 + c - 'A' + 10 :
|
||||
throw(ArgumentError("not a hexadecimal string: $(repr(s))"))
|
||||
a[j += 1] = n
|
||||
end
|
||||
resize!(a, j)
|
||||
return a
|
||||
end
|
||||
|
||||
"""
|
||||
bytes2hex(bin_arr::Array{UInt8, 1}) -> String
|
||||
|
||||
Convert an array of bytes to its hexadecimal representation.
|
||||
All characters are in lower-case.
|
||||
|
||||
```jldoctest
|
||||
julia> a = hex(12345)
|
||||
"3039"
|
||||
|
||||
julia> b = hex2bytes(a)
|
||||
2-element Array{UInt8,1}:
|
||||
0x30
|
||||
0x39
|
||||
|
||||
julia> bytes2hex(b)
|
||||
"3039"
|
||||
```
|
||||
"""
|
||||
function bytes2hex(a::AbstractArray{UInt8})
|
||||
b = Vector{UInt8}(2*length(a))
|
||||
i = 0
|
||||
for x in a
|
||||
b[i += 1] = hex_chars[1 + x >> 4]
|
||||
b[i += 1] = hex_chars[1 + x & 0xf]
|
||||
end
|
||||
return String(b)
|
||||
end
|
||||
|
||||
# check for pure ASCII-ness
|
||||
|
||||
# Validate that every byte of `s` is below 0x80; returns `s` itself on success.
function ascii(s::String)
    for (i, b) in enumerate(Vector{UInt8}(s))
        if b >= 0x80
            throw(ArgumentError("invalid ASCII at index $i in $(repr(s))"))
        end
    end
    return s
end

"""
    ascii(s::AbstractString)

Convert a string to `String` type and check that it contains only ASCII data, otherwise
throwing an `ArgumentError` indicating the position of the first non-ASCII byte.

```jldoctest
julia> ascii("abcdeγfgh")
ERROR: ArgumentError: invalid ASCII at index 6 in "abcdeγfgh"
Stacktrace:
 [1] ascii(::String) at ./strings/util.jl:479

julia> ascii("abcdefgh")
"abcdefgh"
```
"""
function ascii(x::AbstractString)
    return ascii(convert(String, x))
end
|
||||
Reference in New Issue
Block a user