arrow_cast/cast/
dictionary.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::cast::*;
19
20/// Attempts to cast an `ArrayDictionary` with index type K into
21/// `to_type` for supported types.
22///
23/// K is the key type
24pub(crate) fn dictionary_cast<K: ArrowDictionaryKeyType>(
25    array: &dyn Array,
26    to_type: &DataType,
27    cast_options: &CastOptions,
28) -> Result<ArrayRef, ArrowError> {
29    use DataType::*;
30
31    match to_type {
32        Dictionary(to_index_type, to_value_type) => {
33            let dict_array = array
34                .as_any()
35                .downcast_ref::<DictionaryArray<K>>()
36                .ok_or_else(|| {
37                    ArrowError::ComputeError(
38                        "Internal Error: Cannot cast dictionary to DictionaryArray of expected type".to_string(),
39                    )
40                })?;
41
42            let keys_array: ArrayRef =
43                Arc::new(PrimitiveArray::<K>::from(dict_array.keys().to_data()));
44            let values_array = dict_array.values();
45            let cast_keys = cast_with_options(&keys_array, to_index_type, cast_options)?;
46            let cast_values = cast_with_options(values_array, to_value_type, cast_options)?;
47
48            // Failure to cast keys (because they don't fit in the
49            // target type) results in NULL values;
50            if cast_keys.null_count() > keys_array.null_count() {
51                return Err(ArrowError::ComputeError(format!(
52                    "Could not convert {} dictionary indexes from {:?} to {:?}",
53                    cast_keys.null_count() - keys_array.null_count(),
54                    keys_array.data_type(),
55                    to_index_type
56                )));
57            }
58
59            let data = cast_keys.into_data();
60            let builder = data
61                .into_builder()
62                .data_type(to_type.clone())
63                .child_data(vec![cast_values.into_data()]);
64
65            // Safety
66            // Cast keys are still valid
67            let data = unsafe { builder.build_unchecked() };
68
69            // create the appropriate array type
70            let new_array: ArrayRef = match **to_index_type {
71                Int8 => Arc::new(DictionaryArray::<Int8Type>::from(data)),
72                Int16 => Arc::new(DictionaryArray::<Int16Type>::from(data)),
73                Int32 => Arc::new(DictionaryArray::<Int32Type>::from(data)),
74                Int64 => Arc::new(DictionaryArray::<Int64Type>::from(data)),
75                UInt8 => Arc::new(DictionaryArray::<UInt8Type>::from(data)),
76                UInt16 => Arc::new(DictionaryArray::<UInt16Type>::from(data)),
77                UInt32 => Arc::new(DictionaryArray::<UInt32Type>::from(data)),
78                UInt64 => Arc::new(DictionaryArray::<UInt64Type>::from(data)),
79                _ => {
80                    return Err(ArrowError::CastError(format!(
81                        "Unsupported type {to_index_type:?} for dictionary index"
82                    )));
83                }
84            };
85
86            Ok(new_array)
87        }
88        Utf8View => {
89            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
90            // we handle it here to avoid the copy.
91            let dict_array = array
92                .as_dictionary::<K>()
93                .downcast_dict::<StringArray>()
94                .ok_or_else(|| {
95                    ArrowError::ComputeError(
96                        "Internal Error: Cannot cast Utf8View to StringArray of expected type"
97                            .to_string(),
98                    )
99                })?;
100
101            let string_view = view_from_dict_values::<K, StringViewType, GenericStringType<i32>>(
102                dict_array.values(),
103                dict_array.keys(),
104            )?;
105            Ok(Arc::new(string_view))
106        }
107        BinaryView => {
108            // `unpack_dictionary` can handle Utf8View/BinaryView types, but incurs unnecessary data copy of the value buffer.
109            // we handle it here to avoid the copy.
110            let dict_array = array
111                .as_dictionary::<K>()
112                .downcast_dict::<BinaryArray>()
113                .ok_or_else(|| {
114                    ArrowError::ComputeError(
115                        "Internal Error: Cannot cast BinaryView to BinaryArray of expected type"
116                            .to_string(),
117                    )
118                })?;
119
120            let binary_view = view_from_dict_values::<K, BinaryViewType, BinaryType>(
121                dict_array.values(),
122                dict_array.keys(),
123            )?;
124            Ok(Arc::new(binary_view))
125        }
126        _ => unpack_dictionary::<K>(array, to_type, cast_options),
127    }
128}
129
130fn view_from_dict_values<K: ArrowDictionaryKeyType, T: ByteViewType, V: ByteArrayType>(
131    array: &GenericByteArray<V>,
132    keys: &PrimitiveArray<K>,
133) -> Result<GenericByteViewArray<T>, ArrowError> {
134    let value_buffer = array.values();
135    let value_offsets = array.value_offsets();
136    let mut builder = GenericByteViewBuilder::<T>::with_capacity(keys.len());
137    builder.append_block(value_buffer.clone());
138    for i in keys.iter() {
139        match i {
140            Some(v) => {
141                let idx = v.to_usize().ok_or_else(|| {
142                    ArrowError::ComputeError("Invalid dictionary index".to_string())
143                })?;
144
145                // Safety
146                // (1) The index is within bounds as they are offsets
147                // (2) The append_view is safe
148                unsafe {
149                    let offset = value_offsets.get_unchecked(idx).as_usize();
150                    let end = value_offsets.get_unchecked(idx + 1).as_usize();
151                    let length = end - offset;
152                    builder.append_view_unchecked(0, offset as u32, length as u32)
153                }
154            }
155            None => {
156                builder.append_null();
157            }
158        }
159    }
160    Ok(builder.finish())
161}
162
163// Unpack a dictionary where the keys are of type <K> into a flattened array of type to_type
164pub(crate) fn unpack_dictionary<K>(
165    array: &dyn Array,
166    to_type: &DataType,
167    cast_options: &CastOptions,
168) -> Result<ArrayRef, ArrowError>
169where
170    K: ArrowDictionaryKeyType,
171{
172    let dict_array = array.as_dictionary::<K>();
173    let cast_dict_values = cast_with_options(dict_array.values(), to_type, cast_options)?;
174    take(cast_dict_values.as_ref(), dict_array.keys(), None)
175}
176
177/// Pack a data type into a dictionary array passing the values through a primitive array
178pub(crate) fn pack_array_to_dictionary_via_primitive<K: ArrowDictionaryKeyType>(
179    array: &dyn Array,
180    primitive_type: DataType,
181    dict_value_type: &DataType,
182    cast_options: &CastOptions,
183) -> Result<ArrayRef, ArrowError> {
184    let primitive = cast_with_options(array, &primitive_type, cast_options)?;
185    let dict = cast_with_options(
186        primitive.as_ref(),
187        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(primitive_type)),
188        cast_options,
189    )?;
190    cast_with_options(
191        dict.as_ref(),
192        &DataType::Dictionary(Box::new(K::DATA_TYPE), Box::new(dict_value_type.clone())),
193        cast_options,
194    )
195}
196
197/// Attempts to encode an array into an `ArrayDictionary` with index
198/// type K and value (dictionary) type value_type
199///
200/// K is the key type
201pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
202    array: &dyn Array,
203    dict_value_type: &DataType,
204    cast_options: &CastOptions,
205) -> Result<ArrayRef, ArrowError> {
206    use DataType::*;
207
208    match *dict_value_type {
209        Int8 => pack_numeric_to_dictionary::<K, Int8Type>(array, dict_value_type, cast_options),
210        Int16 => pack_numeric_to_dictionary::<K, Int16Type>(array, dict_value_type, cast_options),
211        Int32 => pack_numeric_to_dictionary::<K, Int32Type>(array, dict_value_type, cast_options),
212        Int64 => pack_numeric_to_dictionary::<K, Int64Type>(array, dict_value_type, cast_options),
213        UInt8 => pack_numeric_to_dictionary::<K, UInt8Type>(array, dict_value_type, cast_options),
214        UInt16 => pack_numeric_to_dictionary::<K, UInt16Type>(array, dict_value_type, cast_options),
215        UInt32 => pack_numeric_to_dictionary::<K, UInt32Type>(array, dict_value_type, cast_options),
216        UInt64 => pack_numeric_to_dictionary::<K, UInt64Type>(array, dict_value_type, cast_options),
217        Decimal128(p, s) => pack_decimal_to_dictionary::<K, Decimal128Type>(
218            array,
219            dict_value_type,
220            p,
221            s,
222            cast_options,
223        ),
224        Decimal256(p, s) => pack_decimal_to_dictionary::<K, Decimal256Type>(
225            array,
226            dict_value_type,
227            p,
228            s,
229            cast_options,
230        ),
231        Float16 => {
232            pack_numeric_to_dictionary::<K, Float16Type>(array, dict_value_type, cast_options)
233        }
234        Float32 => {
235            pack_numeric_to_dictionary::<K, Float32Type>(array, dict_value_type, cast_options)
236        }
237        Float64 => {
238            pack_numeric_to_dictionary::<K, Float64Type>(array, dict_value_type, cast_options)
239        }
240        Date32 => pack_array_to_dictionary_via_primitive::<K>(
241            array,
242            DataType::Int32,
243            dict_value_type,
244            cast_options,
245        ),
246        Date64 => pack_array_to_dictionary_via_primitive::<K>(
247            array,
248            DataType::Int64,
249            dict_value_type,
250            cast_options,
251        ),
252        Time32(_) => pack_array_to_dictionary_via_primitive::<K>(
253            array,
254            DataType::Int32,
255            dict_value_type,
256            cast_options,
257        ),
258        Time64(_) => pack_array_to_dictionary_via_primitive::<K>(
259            array,
260            DataType::Int64,
261            dict_value_type,
262            cast_options,
263        ),
264        Timestamp(_, _) => pack_array_to_dictionary_via_primitive::<K>(
265            array,
266            DataType::Int64,
267            dict_value_type,
268            cast_options,
269        ),
270        Utf8 => {
271            // If the input is a view type, we can avoid casting (thus copying) the data
272            if array.data_type() == &DataType::Utf8View {
273                return string_view_to_dictionary::<K, i32>(array);
274            }
275            pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
276        }
277        LargeUtf8 => {
278            // If the input is a view type, we can avoid casting (thus copying) the data
279            if array.data_type() == &DataType::Utf8View {
280                return string_view_to_dictionary::<K, i64>(array);
281            }
282            pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
283        }
284        Binary => {
285            // If the input is a view type, we can avoid casting (thus copying) the data
286            if array.data_type() == &DataType::BinaryView {
287                return binary_view_to_dictionary::<K, i32>(array);
288            }
289            pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
290        }
291        LargeBinary => {
292            // If the input is a view type, we can avoid casting (thus copying) the data
293            if array.data_type() == &DataType::BinaryView {
294                return binary_view_to_dictionary::<K, i64>(array);
295            }
296            pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
297        }
298        _ => Err(ArrowError::CastError(format!(
299            "Unsupported output type for dictionary packing: {dict_value_type:?}"
300        ))),
301    }
302}
303
304// Packs the data from the primitive array of type <V> to a
305// DictionaryArray with keys of type K and values of value_type V
306pub(crate) fn pack_numeric_to_dictionary<K, V>(
307    array: &dyn Array,
308    dict_value_type: &DataType,
309    cast_options: &CastOptions,
310) -> Result<ArrayRef, ArrowError>
311where
312    K: ArrowDictionaryKeyType,
313    V: ArrowPrimitiveType,
314{
315    // attempt to cast the source array values to the target value type (the dictionary values type)
316    let cast_values = cast_with_options(array, dict_value_type, cast_options)?;
317    let values = cast_values.as_primitive::<V>();
318
319    let mut b = PrimitiveDictionaryBuilder::<K, V>::with_capacity(values.len(), values.len());
320
321    // copy each element one at a time
322    for i in 0..values.len() {
323        if values.is_null(i) {
324            b.append_null();
325        } else {
326            b.append(values.value(i))?;
327        }
328    }
329    Ok(Arc::new(b.finish()))
330}
331
332pub(crate) fn pack_decimal_to_dictionary<K, D>(
333    array: &dyn Array,
334    dict_value_type: &DataType,
335    precision: u8,
336    scale: i8,
337    cast_options: &CastOptions,
338) -> Result<ArrayRef, ArrowError>
339where
340    K: ArrowDictionaryKeyType,
341    D: DecimalType + ArrowPrimitiveType,
342{
343    let dict = pack_numeric_to_dictionary::<K, D>(array, dict_value_type, cast_options)?;
344    let dict = dict
345        .as_dictionary::<K>()
346        .downcast_dict::<PrimitiveArray<D>>()
347        .ok_or_else(|| {
348            ArrowError::ComputeError(format!(
349                "Internal Error: Cannot cast dict to {}Array",
350                D::PREFIX
351            ))
352        })?;
353    let value = dict.values().clone();
354    // Set correct precision/scale
355    let value = value.with_precision_and_scale(precision, scale)?;
356    Ok(Arc::new(DictionaryArray::<K>::try_new(
357        dict.keys().clone(),
358        Arc::new(value),
359    )?))
360}
361
362pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
363    array: &dyn Array,
364) -> Result<ArrayRef, ArrowError>
365where
366    K: ArrowDictionaryKeyType,
367{
368    let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
369        array.len(),
370        1024,
371        1024,
372    );
373    let string_view = array
374        .as_any()
375        .downcast_ref::<StringViewArray>()
376        .ok_or_else(|| {
377            ArrowError::ComputeError("Internal Error: Cannot cast to StringViewArray".to_string())
378        })?;
379    for v in string_view.iter() {
380        match v {
381            Some(v) => {
382                b.append(v)?;
383            }
384            None => {
385                b.append_null();
386            }
387        }
388    }
389
390    Ok(Arc::new(b.finish()))
391}
392
393pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
394    array: &dyn Array,
395) -> Result<ArrayRef, ArrowError>
396where
397    K: ArrowDictionaryKeyType,
398{
399    let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
400        array.len(),
401        1024,
402        1024,
403    );
404    let binary_view = array
405        .as_any()
406        .downcast_ref::<BinaryViewArray>()
407        .ok_or_else(|| {
408            ArrowError::ComputeError("Internal Error: Cannot cast to BinaryViewArray".to_string())
409        })?;
410    for v in binary_view.iter() {
411        match v {
412            Some(v) => {
413                b.append(v)?;
414            }
415            None => {
416                b.append_null();
417            }
418        }
419    }
420
421    Ok(Arc::new(b.finish()))
422}
423
424// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
425// key types of K
426pub(crate) fn pack_byte_to_dictionary<K, T>(
427    array: &dyn Array,
428    cast_options: &CastOptions,
429) -> Result<ArrayRef, ArrowError>
430where
431    K: ArrowDictionaryKeyType,
432    T: ByteArrayType,
433{
434    let cast_values = cast_with_options(array, &T::DATA_TYPE, cast_options)?;
435    let values = cast_values
436        .as_any()
437        .downcast_ref::<GenericByteArray<T>>()
438        .ok_or_else(|| {
439            ArrowError::ComputeError("Internal Error: Cannot cast to GenericByteArray".to_string())
440        })?;
441    let mut b = GenericByteDictionaryBuilder::<K, T>::with_capacity(values.len(), 1024, 1024);
442
443    // copy each element one at a time
444    for i in 0..values.len() {
445        if values.is_null(i) {
446            b.append_null();
447        } else {
448            b.append(values.value(i))?;
449        }
450    }
451    Ok(Arc::new(b.finish()))
452}