in src/arraytypes/dictencoding.jl [200:340]
# Build the `DictEncoded` arrow vector for a `DictEncode`-wrapped column.
#
# Arguments (as used by this method):
# - `x`: a `DictEncode` wrapper; `x.data` holds the values and `x.id` the dictionary id,
#   where `-1` means "derive the id via `dictencodeid(i, nl, fi)`".
# - `i`, `nl`, `fi`: forwarded to `dictencodeid` and to the nested `arrowvector` calls.
# - `de`: id-keyed collection of `Lockable`-wrapped `DictEncoding`s; a new entry is stored
#   the first time an id is seen, and replaced when an existing dictionary has to be
#   re-wrapped in a `ChainedVector`.
# - `ded`: collection that receives a delta `DictEncoding` whenever `x` contains values
#   that are not yet part of the existing dictionary for `id`.
# - `meta`: column metadata or `nothing`; merged with the encoding's metadata when both
#   are present (entries from the encoding win on key clashes).
# - `dictencode`: accepted so that it is not forwarded through `kw`; not read here.
# - `dictencodenested`: passed as both `dictencode` and `dictencodenested` when the
#   dictionary values themselves are serialized.
#
# Returns `DictEncoded(UInt8[], validity, inds, encoding, meta)`, where `inds` holds
# 0-based offsets into the dictionary values.
#
# Raises an error (via `error`) when growing an existing dictionary would yield more
# unique values than `typemax` of its index type.
function arrowvector(
    ::DictEncodedKind,
    x,
    i,
    nl,
    fi,
    de,
    ded,
    meta;
    dictencode::Bool=false,
    dictencodenested::Bool=false,
    kw...,
)
    @assert x isa DictEncode
    id = x.id == -1 ? dictencodeid(i, nl, fi) : x.id
    x = x.data
    validity = ValidityBitmap(x)
    if !haskey(de, id)
        # No dictionary registered under `id` yet: create it from this column.
        if DataAPI.refarray(x) === x || DataAPI.refpool(x) === nothing
            # `x` exposes no separate ref array / pool, so pool the values ourselves.
            x = PooledArray(x; signed=true, compress=true)
            inds = DataAPI.refarray(x)
            pool = DataAPI.refpool(x)
        else
            pool = DataAPI.refpool(x)
            refa = DataAPI.refarray(x)
            # Copy the refs into a fresh vector with a signed element type.
            inds = copyto!(similar(Vector{signedtype(length(pool))}, length(refa)), refa)
        end
        # Refs are 1-based indices and are shifted to 0-based offsets below, except for
        # a categorical pool whose element type admits `missing`: there the refs are
        # kept as-is and `missing` is prepended to the levels instead
        # (presumably ref 0 denotes `missing` in that case; verify against
        # CategoricalArrays). The pool type is matched by name, so no dependency on
        # the defining package is needed.
        shiftrefs = true
        if typeof(pool).name.name == :CategoricalRefPool
            if eltype(x) >: Missing
                pool = vcat(missing, DataAPI.levels(x))
                shiftrefs = false
            else
                pool = DataAPI.levels(x)
            end
        end
        if shiftrefs
            for k in eachindex(inds)
                @inbounds inds[k] -= 1
            end
        end
        data = arrowvector(
            pool,
            i,
            nl,
            fi,
            de,
            ded,
            nothing;
            dictencode=dictencodenested,
            dictencodenested=dictencodenested,
            dictencoding=true,
            kw...,
        )
        encoding = DictEncoding{eltype(data),eltype(inds),typeof(data)}(
            id,
            data,
            false,
            getmetadata(data),
        )
        de[id] = Lockable(encoding)
    else
        # A dictionary already exists for `id`: compute offsets against it and collect
        # values it does not contain yet as deltas.
        encodinglockable = de[id]
        Base.@lock encodinglockable begin
            encoding = encodinglockable.value
            ET = indextype(encoding)
            # Offsets are tracked as `Int` rather than `ET`: with `ET` as the value type,
            # `get!` would throw a bare `InexactError` as soon as an offset no longer
            # fits, before the descriptive overflow error below could be raised.
            lookup = Dict{Union{eltype(encoding),eltype(x)},Int}(
                value => (pos - 1) for (pos, value) in enumerate(encoding)
            )
            deltas = eltype(x)[]
            inds = Vector{ET}(undef, length(x))
            categorical = typeof(x).name.name == :CategoricalArray
            for (j, val) in enumerate(x)
                key = categorical ? get(val) : val
                offset = get!(lookup, key) do
                    push!(deltas, key)
                    # Size before insertion == 0-based offset of the new value.
                    return length(lookup)
                end
                # Truncating store; whenever truncation could alter a value, the
                # overflow check below is guaranteed to fail and `inds` is discarded.
                @inbounds inds[j] = offset % ET
            end
            if !isempty(deltas)
                if length(deltas) + length(encoding) > typemax(ET)
                    error(
                        "fatal error serializing dict encoded column with ref index type of $ET; subsequent record batch unique values resulted in $(length(deltas) + length(encoding)) unique values, which exceeds possible index values in $ET",
                    )
                end
                data = arrowvector(
                    deltas,
                    i,
                    nl,
                    fi,
                    de,
                    ded,
                    nothing;
                    dictencode=dictencodenested,
                    dictencodenested=dictencodenested,
                    dictencoding=true,
                    kw...,
                )
                # Record the delta on its own ...
                push!(
                    ded,
                    DictEncoding{eltype(data),ET,typeof(data)}(
                        id,
                        data,
                        false,
                        getmetadata(data),
                    ),
                )
                # ... and extend the full dictionary kept in `de`.
                if typeof(encoding.data) <: ChainedVector
                    append!(encoding.data, data)
                else
                    data2 = ChainedVector([encoding.data, data])
                    encoding = DictEncoding{eltype(data2),ET,typeof(data2)}(
                        id,
                        data2,
                        false,
                        getmetadata(encoding),
                    )
                    de[id] = Lockable(encoding)
                end
            end
        end
    end
    if meta !== nothing && getmetadata(encoding) !== nothing
        meta = toidict(merge!(Dict(meta), Dict(getmetadata(encoding))))
    elseif getmetadata(encoding) !== nothing
        meta = getmetadata(encoding)
    end
    return DictEncoded(UInt8[], validity, inds, encoding, meta)
end