Merging PR_218 openai_rev package with new streamlit chat app
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/engine/substrait/extension_set.h"
|
||||
#include "arrow/engine/substrait/extension_types.h"
|
||||
#include "arrow/engine/substrait/relation.h"
|
||||
#include "arrow/engine/substrait/serde.h"
|
||||
@@ -0,0 +1,458 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/api_aggregate.h"
|
||||
#include "arrow/compute/exec/expression.h"
|
||||
#include "arrow/engine/substrait/type_fwd.h"
|
||||
#include "arrow/engine/substrait/visibility.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace engine {
|
||||
|
||||
// URIs of the standard Substrait function extension files. Every URI points at
// a YAML file under the extensions/ directory of the substrait-io/substrait
// repository; one constant is provided per function family.
constexpr const char* kSubstraitArithmeticFunctionsUri =
    "https://github.com/substrait-io/substrait/"
    "blob/main/extensions/functions_arithmetic.yaml";
constexpr const char* kSubstraitBooleanFunctionsUri =
    "https://github.com/substrait-io/substrait/"
    "blob/main/extensions/functions_boolean.yaml";
constexpr const char* kSubstraitComparisonFunctionsUri =
    "https://github.com/substrait-io/substrait/"
    "blob/main/extensions/functions_comparison.yaml";
constexpr const char* kSubstraitDatetimeFunctionsUri =
    "https://github.com/substrait-io/substrait/"
    "blob/main/extensions/functions_datetime.yaml";
constexpr const char* kSubstraitLogarithmicFunctionsUri =
    "https://github.com/substrait-io/substrait/"
    "blob/main/extensions/functions_logarithmic.yaml";
constexpr const char* kSubstraitRoundingFunctionsUri =
    "https://github.com/substrait-io/substrait/"
    "blob/main/extensions/functions_rounding.yaml";
constexpr const char* kSubstraitStringFunctionsUri =
    "https://github.com/substrait-io/substrait/"
    "blob/main/extensions/functions_string.yaml";
constexpr const char* kSubstraitAggregateGenericFunctionsUri =
    "https://github.com/substrait-io/substrait/"
    "blob/main/extensions/functions_aggregate_generic.yaml";
|
||||
|
||||
/// \brief A Substrait identifier: a (uri, name) pair of string views
///
/// Both members are non-owning views, so the character data they point at
/// must outlive the Id.
struct Id {
  std::string_view uri;
  std::string_view name;

  /// \brief True when both the uri and the name are empty
  bool empty() const { return name.empty() && uri.empty(); }

  /// \brief Render the id as a string (defined out of line)
  std::string ToString() const;
};
|
||||
struct IdHashEq {
|
||||
size_t operator()(Id id) const;
|
||||
bool operator()(Id l, Id r) const;
|
||||
};
|
||||
|
||||
/// \brief Owning storage for ids
|
||||
///
|
||||
/// Substrait plans may reuse URIs and names in many places. For convenience
|
||||
/// and performance Substrait ids are typically passed around as views. As we
|
||||
/// convert a plan from Substrait to Arrow we need to copy these strings out of
|
||||
/// the Substrait buffer and into owned storage. This class serves as that owned
|
||||
/// storage.
|
||||
class IdStorage {
|
||||
public:
|
||||
virtual ~IdStorage() = default;
|
||||
/// \brief Get an equivalent id pointing into this storage
|
||||
///
|
||||
/// This operation will copy the ids into storage if they do not already exist
|
||||
virtual Id Emplace(Id id) = 0;
|
||||
/// \brief Get an equivalent view pointing into this storage for a URI
|
||||
///
|
||||
/// If no URI is found then the uri will be copied into storage
|
||||
virtual std::string_view EmplaceUri(std::string_view uri) = 0;
|
||||
/// \brief Get an equivalent id pointing into this storage
|
||||
///
|
||||
/// If no id is found then nullopt will be returned
|
||||
virtual std::optional<Id> Find(Id id) const = 0;
|
||||
/// \brief Get an equivalent view pointing into this storage for a URI
|
||||
///
|
||||
/// If no URI is found then nullopt will be returned
|
||||
virtual std::optional<std::string_view> FindUri(std::string_view uri) const = 0;
|
||||
|
||||
static std::unique_ptr<IdStorage> Make();
|
||||
};
|
||||
|
||||
/// \brief Describes a Substrait call
|
||||
///
|
||||
/// Substrait call expressions contain a list of arguments which can either
|
||||
/// be enum arguments (which are serialized as strings), value arguments (which
|
||||
/// are Arrow expressions), or type arguments (not yet implemented)
|
||||
class SubstraitCall {
|
||||
public:
|
||||
SubstraitCall(Id id, std::shared_ptr<DataType> output_type, bool output_nullable,
|
||||
bool is_hash = false)
|
||||
: id_(id),
|
||||
output_type_(std::move(output_type)),
|
||||
output_nullable_(output_nullable),
|
||||
is_hash_(is_hash) {}
|
||||
|
||||
const Id& id() const { return id_; }
|
||||
const std::shared_ptr<DataType>& output_type() const { return output_type_; }
|
||||
bool output_nullable() const { return output_nullable_; }
|
||||
bool is_hash() const { return is_hash_; }
|
||||
|
||||
bool HasEnumArg(int index) const;
|
||||
Result<std::string_view> GetEnumArg(int index) const;
|
||||
void SetEnumArg(int index, std::string enum_arg);
|
||||
Result<compute::Expression> GetValueArg(int index) const;
|
||||
bool HasValueArg(int index) const;
|
||||
void SetValueArg(int index, compute::Expression value_arg);
|
||||
std::optional<std::vector<std::string> const*> GetOption(
|
||||
std::string_view option_name) const;
|
||||
void SetOption(std::string_view option_name,
|
||||
const std::vector<std::string_view>& option_preferences);
|
||||
int size() const { return size_; }
|
||||
|
||||
private:
|
||||
Id id_;
|
||||
std::shared_ptr<DataType> output_type_;
|
||||
bool output_nullable_;
|
||||
// Only needed when converting from Substrait -> Arrow aggregates. The
|
||||
// Arrow function name depends on whether or not there are any groups
|
||||
bool is_hash_;
|
||||
std::unordered_map<int, std::string> enum_args_;
|
||||
std::unordered_map<int, compute::Expression> value_args_;
|
||||
std::unordered_map<std::string, std::vector<std::string>> options_;
|
||||
int size_ = 0;
|
||||
};
|
||||
|
||||
/// Substrait identifies functions and custom data types using a (uri, name) pair.
|
||||
///
|
||||
/// This registry is a bidirectional mapping between Substrait IDs and their
|
||||
/// corresponding Arrow counterparts (arrow::DataType and function names in a function
|
||||
/// registry)
|
||||
///
|
||||
/// Substrait extension types and variations must be registered with their
|
||||
/// corresponding arrow::DataType before they can be used!
|
||||
///
|
||||
/// Conceptually this can be thought of as two pairs of `unordered_map`s. One pair to
|
||||
/// go back and forth between Substrait ID and arrow::DataType and another pair to go
|
||||
/// back and forth between Substrait ID and Arrow function names.
|
||||
///
|
||||
/// Unlike an ExtensionSet this registry is not created automatically when consuming
|
||||
/// Substrait plans and must be configured ahead of time (although there is a default
|
||||
/// instance).
|
||||
class ARROW_ENGINE_EXPORT ExtensionIdRegistry {
|
||||
public:
|
||||
using ArrowToSubstraitCall =
|
||||
std::function<Result<SubstraitCall>(const arrow::compute::Expression::Call&)>;
|
||||
using SubstraitCallToArrow =
|
||||
std::function<Result<arrow::compute::Expression>(const SubstraitCall&)>;
|
||||
using ArrowToSubstraitAggregate =
|
||||
std::function<Result<SubstraitCall>(const arrow::compute::Aggregate&)>;
|
||||
using SubstraitAggregateToArrow =
|
||||
std::function<Result<arrow::compute::Aggregate>(const SubstraitCall&)>;
|
||||
|
||||
/// \brief A mapping between a Substrait ID and an arrow::DataType
|
||||
struct TypeRecord {
|
||||
Id id;
|
||||
const std::shared_ptr<DataType>& type;
|
||||
};
|
||||
|
||||
/// \brief Return a uri view owned by this registry
|
||||
///
|
||||
/// If the URI has never been emplaced it will return nullopt
|
||||
virtual std::optional<std::string_view> FindUri(std::string_view uri) const = 0;
|
||||
/// \brief Return a id view owned by this registry
|
||||
///
|
||||
/// If the id has never been emplaced it will return nullopt
|
||||
virtual std::optional<Id> FindId(Id id) const = 0;
|
||||
virtual std::optional<TypeRecord> GetType(const DataType&) const = 0;
|
||||
virtual std::optional<TypeRecord> GetType(Id) const = 0;
|
||||
virtual Status CanRegisterType(Id, const std::shared_ptr<DataType>& type) const = 0;
|
||||
virtual Status RegisterType(Id, std::shared_ptr<DataType>) = 0;
|
||||
/// \brief Register a converter that converts an Arrow call to a Substrait call
|
||||
///
|
||||
/// Note that there may not be 1:1 parity between ArrowToSubstraitCall and
|
||||
/// SubstraitCallToArrow because some standard functions (e.g. add) may map to
|
||||
/// multiple Arrow functions (e.g. add, add_checked)
|
||||
virtual Status AddArrowToSubstraitCall(std::string arrow_function_name,
|
||||
ArrowToSubstraitCall conversion_func) = 0;
|
||||
/// \brief Check to see if a converter can be registered
|
||||
///
|
||||
/// \return Status::OK if there are no conflicts, otherwise an error is returned
|
||||
virtual Status CanAddArrowToSubstraitCall(
|
||||
const std::string& arrow_function_name) const = 0;
|
||||
|
||||
/// \brief Register a converter that converts an Arrow aggregate to a Substrait
|
||||
/// aggregate
|
||||
virtual Status AddArrowToSubstraitAggregate(
|
||||
std::string arrow_function_name, ArrowToSubstraitAggregate conversion_func) = 0;
|
||||
/// \brief Check to see if a converter can be registered
|
||||
///
|
||||
/// \return Status::OK if there are no conflicts, otherwise an error is returned
|
||||
virtual Status CanAddArrowToSubstraitAggregate(
|
||||
const std::string& arrow_function_name) const = 0;
|
||||
|
||||
/// \brief Register a converter that converts a Substrait call to an Arrow call
|
||||
virtual Status AddSubstraitCallToArrow(Id substrait_function_id,
|
||||
SubstraitCallToArrow conversion_func) = 0;
|
||||
/// \brief Check to see if a converter can be registered
|
||||
///
|
||||
/// \return Status::OK if there are no conflicts, otherwise an error is returned
|
||||
virtual Status CanAddSubstraitCallToArrow(Id substrait_function_id) const = 0;
|
||||
/// \brief Register a simple mapping function
|
||||
///
|
||||
/// All calls to the function must pass only value arguments. The arguments
|
||||
/// will be converted to expressions and passed to the Arrow function
|
||||
virtual Status AddSubstraitCallToArrow(Id substrait_function_id,
|
||||
std::string arrow_function_name) = 0;
|
||||
|
||||
/// \brief Register a converter that converts a Substrait aggregate to an Arrow
|
||||
/// aggregate
|
||||
virtual Status AddSubstraitAggregateToArrow(
|
||||
Id substrait_function_id, SubstraitAggregateToArrow conversion_func) = 0;
|
||||
/// \brief Check to see if a converter can be registered
|
||||
///
|
||||
/// \return Status::OK if there are no conflicts, otherwise an error is returned
|
||||
virtual Status CanAddSubstraitAggregateToArrow(Id substrait_function_id) const = 0;
|
||||
|
||||
/// \brief Return a list of Substrait functions that have a converter
|
||||
///
|
||||
/// The function ids are encoded as strings using the pattern {uri}#{name}
|
||||
virtual std::vector<std::string> GetSupportedSubstraitFunctions() const = 0;
|
||||
|
||||
/// \brief Find a converter to map Arrow calls to Substrait calls
|
||||
/// \return A converter function or an invalid status if no converter is registered
|
||||
virtual Result<ArrowToSubstraitCall> GetArrowToSubstraitCall(
|
||||
const std::string& arrow_function_name) const = 0;
|
||||
|
||||
/// \brief Find a converter to map Arrow aggregates to Substrait aggregates
|
||||
/// \return A converter function or an invalid status if no converter is registered
|
||||
virtual Result<ArrowToSubstraitAggregate> GetArrowToSubstraitAggregate(
|
||||
const std::string& arrow_function_name) const = 0;
|
||||
|
||||
/// \brief Find a converter to map a Substrait aggregate to an Arrow aggregate
|
||||
/// \return A converter function or an invalid status if no converter is registered
|
||||
virtual Result<SubstraitAggregateToArrow> GetSubstraitAggregateToArrow(
|
||||
Id substrait_function_id) const = 0;
|
||||
|
||||
/// \brief Find a converter to map a Substrait call to an Arrow call
|
||||
/// \return A converter function or an invalid status if no converter is registered
|
||||
virtual Result<SubstraitCallToArrow> GetSubstraitCallToArrow(
|
||||
Id substrait_function_id) const = 0;
|
||||
|
||||
/// \brief Similar to \see GetSubstraitCallToArrow but only uses the name
|
||||
///
|
||||
/// There may be multiple functions with the same name and this will return
|
||||
/// the first. This is slower than GetSubstraitCallToArrow and should only
|
||||
/// be used when the plan does not include a URI (or the URI is "/")
|
||||
virtual Result<SubstraitCallToArrow> GetSubstraitCallToArrowFallback(
|
||||
std::string_view function_name) const = 0;
|
||||
|
||||
/// \brief Similar to \see GetSubstraitAggregateToArrow but only uses the name
|
||||
///
|
||||
/// \see GetSubstraitCallToArrowFallback for details on the fallback behavior
|
||||
virtual Result<SubstraitAggregateToArrow> GetSubstraitAggregateToArrowFallback(
|
||||
std::string_view function_name) const = 0;
|
||||
};
|
||||
|
||||
/// URI of the extension_types.yaml file in the Apache Arrow repository
constexpr std::string_view kArrowExtTypesUri =
    "https://github.com/apache/arrow/blob/master/"
    "format/substrait/extension_types.yaml";
|
||||
|
||||
/// A default registry with all supported functions and data types registered
|
||||
///
|
||||
/// Note: Function support is currently very minimal, see ARROW-15538
|
||||
ARROW_ENGINE_EXPORT ExtensionIdRegistry* default_extension_id_registry();
|
||||
|
||||
/// \brief Make a nested registry with a given parent.
|
||||
///
|
||||
/// A nested registry supports registering types and functions other and on top of those
|
||||
/// already registered in its parent registry. No conflicts in IDs and names used for
|
||||
/// lookup are allowed. Normally, the given parent is the default registry.
|
||||
///
|
||||
/// One use case for a nested registry is for dynamic registration of functions defined
|
||||
/// within a Substrait plan while keeping these registrations specific to the plan. When
|
||||
/// the Substrait plan is disposed of, normally after its execution, the nested registry
|
||||
/// can be disposed of as well.
|
||||
ARROW_ENGINE_EXPORT std::shared_ptr<ExtensionIdRegistry> nested_extension_id_registry(
|
||||
const ExtensionIdRegistry* parent);
|
||||
|
||||
/// \brief A set of extensions used within a plan
|
||||
///
|
||||
/// Each time an extension is used within a Substrait plan the extension
|
||||
/// must be included in an extension set that is defined at the root of the
|
||||
/// plan.
|
||||
///
|
||||
/// The plan refers to a specific extension using an "anchor" which is an
|
||||
/// arbitrary integer invented by the producer that has no meaning beyond a
|
||||
/// plan but which should be consistent within a plan.
|
||||
///
|
||||
/// To support serialization and deserialization this type serves as a
|
||||
/// bidirectional map between Substrait ID and "anchor"s.
|
||||
///
|
||||
/// When deserializing a Substrait plan the extension set should be extracted
|
||||
/// after the plan has been converted from Protobuf and before the plan
|
||||
/// is converted to an execution plan.
|
||||
///
|
||||
/// The extension set can be kept and reused during serialization if a perfect
|
||||
/// round trip is required. If serialization is not needed or round tripping
|
||||
/// is not required then the extension set can be safely discarded after the
|
||||
/// plan has been converted into an execution plan.
|
||||
///
|
||||
/// When converting an execution plan into a Substrait plan an extension set
|
||||
/// can be automatically generated or a previously generated extension set can
|
||||
/// be used.
|
||||
///
|
||||
/// ExtensionSet does not own strings; it only refers to strings in an
|
||||
/// ExtensionIdRegistry.
|
||||
class ARROW_ENGINE_EXPORT ExtensionSet {
|
||||
public:
|
||||
struct FunctionRecord {
|
||||
Id id;
|
||||
std::string_view name;
|
||||
};
|
||||
|
||||
struct TypeRecord {
|
||||
Id id;
|
||||
std::shared_ptr<DataType> type;
|
||||
};
|
||||
|
||||
/// Construct an empty ExtensionSet to be populated during serialization.
|
||||
explicit ExtensionSet(const ExtensionIdRegistry* = default_extension_id_registry());
|
||||
ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionSet);
|
||||
|
||||
/// Construct an ExtensionSet with explicit extension ids for efficient referencing
|
||||
/// during deserialization. Note that input vectors need not be densely packed; an empty
|
||||
/// (default constructed) Id may be used as a placeholder to indicate an unused
|
||||
/// _anchor/_reference. This factory will be used to wrap the extensions declared in a
|
||||
/// substrait::Plan before deserializing the plan's relations.
|
||||
///
|
||||
/// Views will be replaced with equivalent views pointing to memory owned by the
|
||||
/// registry.
|
||||
///
|
||||
/// Note: This is an advanced operation. The order of the ids, types, and functions
|
||||
/// must match the anchor numbers chosen for a plan.
|
||||
///
|
||||
/// An extension set should instead be created using
|
||||
/// arrow::engine::GetExtensionSetFromPlan
|
||||
static Result<ExtensionSet> Make(
|
||||
std::unordered_map<uint32_t, std::string_view> uris,
|
||||
std::unordered_map<uint32_t, Id> type_ids,
|
||||
std::unordered_map<uint32_t, Id> function_ids,
|
||||
const ConversionOptions& conversion_options,
|
||||
const ExtensionIdRegistry* = default_extension_id_registry());
|
||||
|
||||
const std::unordered_map<uint32_t, std::string_view>& uris() const { return uris_; }
|
||||
|
||||
/// \brief Returns a data type given an anchor
|
||||
///
|
||||
/// This is used when converting a Substrait plan to an Arrow execution plan.
|
||||
///
|
||||
/// If the anchor does not exist in this extension set an error will be returned.
|
||||
Result<TypeRecord> DecodeType(uint32_t anchor) const;
|
||||
|
||||
/// \brief Returns the number of custom type records in this extension set
|
||||
///
|
||||
/// Note: the types are currently stored as a sparse vector, so this may return a value
|
||||
/// larger than the actual number of types. This behavior may change in the future; see
|
||||
/// ARROW-15583.
|
||||
std::size_t num_types() const { return types_.size(); }
|
||||
|
||||
/// \brief Lookup the anchor for a given type
|
||||
///
|
||||
/// This operation is used when converting an Arrow execution plan to a Substrait plan.
|
||||
/// If the type has been previously encoded then the same anchor value will returned.
|
||||
///
|
||||
/// If the type has not been previously encoded then a new anchor value will be created.
|
||||
///
|
||||
/// If the type does not exist in the extension id registry then an error will be
|
||||
/// returned.
|
||||
///
|
||||
/// \return An anchor that can be used to refer to the type within a plan
|
||||
Result<uint32_t> EncodeType(const DataType& type);
|
||||
|
||||
/// \brief Return a function id given an anchor
|
||||
///
|
||||
/// This is used when converting a Substrait plan to an Arrow execution plan.
|
||||
///
|
||||
/// If the anchor does not exist in this extension set an error will be returned.
|
||||
Result<Id> DecodeFunction(uint32_t anchor) const;
|
||||
|
||||
/// \brief Lookup the anchor for a given function
|
||||
///
|
||||
/// This operation is used when converting an Arrow execution plan to a Substrait plan.
|
||||
/// If the function has been previously encoded then the same anchor value will be
|
||||
/// returned.
|
||||
///
|
||||
/// If the function has not been previously encoded then a new anchor value will be
|
||||
/// created.
|
||||
///
|
||||
/// If the function name is not in the extension id registry then an error will be
|
||||
/// returned.
|
||||
///
|
||||
/// \return An anchor that can be used to refer to the function within a plan
|
||||
Result<uint32_t> EncodeFunction(Id function_id);
|
||||
|
||||
/// \brief Return the number of custom functions in this extension set
|
||||
std::size_t num_functions() const { return functions_.size(); }
|
||||
|
||||
const ExtensionIdRegistry* registry() const { return registry_; }
|
||||
|
||||
private:
|
||||
const ExtensionIdRegistry* registry_;
|
||||
// If the registry is not aware of an id then we probably can't do anything
|
||||
// with it. However, in some cases, these may represent extensions or features
|
||||
// that we can safely ignore. For example, we can usually safely ignore
|
||||
// extension type variations if we assume the plan is valid. These ignorable
|
||||
// ids are stored here.
|
||||
std::unique_ptr<IdStorage> plan_specific_ids_ = IdStorage::Make();
|
||||
|
||||
// Map from anchor values to URI values referenced by this extension set
|
||||
std::unordered_map<uint32_t, std::string_view> uris_;
|
||||
// Map from anchor values to type definitions, used during Substrait->Arrow
|
||||
// and populated from the Substrait extension set
|
||||
std::unordered_map<uint32_t, TypeRecord> types_;
|
||||
// Map from anchor values to function ids, used during Substrait->Arrow
|
||||
// and populated from the Substrait extension set
|
||||
std::unordered_map<uint32_t, Id> functions_;
|
||||
// Map from type names to anchor values. Used during Arrow->Substrait
|
||||
// and built as the plan is created.
|
||||
std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_;
|
||||
// Map from function names to anchor values. Used during Arrow->Substrait
|
||||
// and built as the plan is created.
|
||||
std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> functions_map_;
|
||||
|
||||
Status CheckHasUri(std::string_view uri);
|
||||
void AddUri(std::pair<uint32_t, std::string_view> uri);
|
||||
Status AddUri(Id id);
|
||||
};
|
||||
|
||||
} // namespace engine
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,80 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
|
||||
#include "arrow/engine/substrait/visibility.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace engine {
|
||||
|
||||
// arrow::ExtensionTypes are provided to wrap uuid, fixed_char, varchar, interval_year,
|
||||
// and interval_day which are first-class types in substrait but do not appear in
|
||||
// the arrow type system.
|
||||
//
|
||||
// Note that these are not automatically registered with arrow::RegisterExtensionType(),
|
||||
// which means among other things that serialization of these types to IPC would fail.
|
||||
|
||||
/// fixed_size_binary(16) for storing Universally Unique IDentifiers
|
||||
ARROW_ENGINE_EXPORT
|
||||
std::shared_ptr<DataType> uuid();
|
||||
|
||||
/// fixed_size_binary(length) constrained to contain only valid UTF-8
|
||||
ARROW_ENGINE_EXPORT
|
||||
std::shared_ptr<DataType> fixed_char(int32_t length);
|
||||
|
||||
/// utf8() constrained to be shorter than `length`
|
||||
ARROW_ENGINE_EXPORT
|
||||
std::shared_ptr<DataType> varchar(int32_t length);
|
||||
|
||||
/// fixed_size_list(int32(), 2) storing a number of [years, months]
|
||||
ARROW_ENGINE_EXPORT
|
||||
std::shared_ptr<DataType> interval_year();
|
||||
|
||||
/// fixed_size_list(int32(), 2) storing a number of [days, seconds]
|
||||
ARROW_ENGINE_EXPORT
|
||||
std::shared_ptr<DataType> interval_day();
|
||||
|
||||
/// Return true if t is Uuid, otherwise false
|
||||
ARROW_ENGINE_EXPORT
|
||||
bool UnwrapUuid(const DataType&);
|
||||
|
||||
/// Return FixedChar length if t is FixedChar, otherwise nullopt
|
||||
ARROW_ENGINE_EXPORT
|
||||
std::optional<int32_t> UnwrapFixedChar(const DataType&);
|
||||
|
||||
/// Return Varchar (max) length if t is VarChar, otherwise nullopt
|
||||
ARROW_ENGINE_EXPORT
|
||||
std::optional<int32_t> UnwrapVarChar(const DataType& t);
|
||||
|
||||
/// Return true if t is IntervalYear, otherwise false
|
||||
ARROW_ENGINE_EXPORT
|
||||
bool UnwrapIntervalYear(const DataType&);
|
||||
|
||||
/// Return true if t is IntervalDay, otherwise false
|
||||
ARROW_ENGINE_EXPORT
|
||||
bool UnwrapIntervalDay(const DataType&);
|
||||
|
||||
} // namespace engine
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,88 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/engine/substrait/type_fwd.h"
|
||||
#include "arrow/engine/substrait/visibility.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace engine {
|
||||
|
||||
/// How strictly to adhere to the input structure when converting between Substrait and
|
||||
/// Acero representations of a plan. This allows the user to trade conversion accuracy
|
||||
/// for performance and lenience.
|
||||
enum class ARROW_ENGINE_EXPORT ConversionStrictness {
|
||||
/// When a primitive is used at the input that doesn't have an exact match at the
|
||||
/// output, reject the conversion. This effectively asserts that there is no (known)
|
||||
/// information loss in the conversion, and that plans should either round-trip back and
|
||||
/// forth exactly or not at all. This option is primarily intended for testing and
|
||||
/// debugging.
|
||||
EXACT_ROUNDTRIP,
|
||||
|
||||
/// When a primitive is used at the input that doesn't have an exact match at the
|
||||
/// output, attempt to model it with some collection of primitives at the output. This
|
||||
/// means that even if the incoming plan is completely optimal by some metric, the
|
||||
/// returned plan is fairly likely to not be optimal anymore, and round-trips back and
|
||||
/// forth may make the plan increasingly suboptimal. However, every primitive at the
|
||||
/// output can be (manually) traced back to exactly one primitive at the input, which
|
||||
/// may be useful when debugging.
|
||||
PRESERVE_STRUCTURE,
|
||||
|
||||
/// Behaves like PRESERVE_STRUCTURE, but prefers performance over structural accuracy.
|
||||
/// Basic optimizations *may* be applied, in order to attempt to not regress in terms of
|
||||
/// plan performance: if the incoming plan was already aggressively optimized, the goal
|
||||
/// is for the output plan to not be less performant. In practical use cases, this is
|
||||
/// probably the option you want.
|
||||
///
|
||||
/// Note that no guarantees are made on top of PRESERVE_STRUCTURE. Past and future
|
||||
/// versions of Arrow may even ignore this option entirely and treat it exactly like
|
||||
/// PRESERVE_STRUCTURE.
|
||||
BEST_EFFORT,
|
||||
};
|
||||
|
||||
using NamedTableProvider =
|
||||
std::function<Result<compute::Declaration>(const std::vector<std::string>&)>;
|
||||
static NamedTableProvider kDefaultNamedTableProvider;
|
||||
|
||||
class ExtensionProvider;
|
||||
|
||||
ARROW_ENGINE_EXPORT std::shared_ptr<ExtensionProvider> default_extension_provider();
|
||||
|
||||
/// Options that control the conversion between Substrait and Acero representations of a
|
||||
/// plan.
|
||||
struct ARROW_ENGINE_EXPORT ConversionOptions {
|
||||
/// \brief How strictly the converter should adhere to the structure of the input.
|
||||
ConversionStrictness strictness = ConversionStrictness::BEST_EFFORT;
|
||||
/// \brief A custom strategy to be used for providing named tables
|
||||
///
|
||||
/// The default behavior will return an invalid status if the plan has any
|
||||
/// named table relations.
|
||||
NamedTableProvider named_table_provider = kDefaultNamedTableProvider;
|
||||
std::shared_ptr<ExtensionProvider> extension_provider = default_extension_provider();
|
||||
};
|
||||
|
||||
} // namespace engine
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,23 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Often-used headers, for precompiling.
|
||||
// If updating this header, please make sure you check compilation speed
|
||||
// before checking in. Adding headers which are not used extremely often
|
||||
// may incur a slowdown, since it makes the precompiled header heavier to load.
|
||||
|
||||
#include "arrow/pch.h"
|
||||
@@ -0,0 +1,38 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "arrow/compute/exec/exec_plan.h"
|
||||
#include "arrow/engine/substrait/visibility.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace engine {
|
||||
|
||||
/// Information resulting from converting a Substrait relation.
|
||||
struct ARROW_ENGINE_EXPORT DeclarationInfo {
|
||||
/// The compute declaration produced thus far.
|
||||
compute::Declaration declaration;
|
||||
|
||||
std::shared_ptr<Schema> output_schema;
|
||||
};
|
||||
|
||||
} // namespace engine
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,304 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/dataset/type_fwd.h"
|
||||
#include "arrow/engine/substrait/options.h"
|
||||
#include "arrow/engine/substrait/type_fwd.h"
|
||||
#include "arrow/engine/substrait/visibility.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace engine {
|
||||
|
||||
/// \brief Serialize an Acero Plan to a binary protobuf Substrait message
|
||||
///
|
||||
/// \param[in] declaration the Acero declaration to serialize.
|
||||
/// This declaration is the sink relation of the Acero plan.
|
||||
/// \param[in,out] ext_set the extension mapping to use; may be updated with new mappings
|
||||
/// \param[in] conversion_options options to control how the conversion is done
|
||||
///
|
||||
/// \return a buffer containing the protobuf serialization of the Acero relation
|
||||
ARROW_ENGINE_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SerializePlan(
|
||||
const compute::Declaration& declaration, ExtensionSet* ext_set,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// Factory function type for generating the node that consumes the batches produced by
|
||||
/// each toplevel Substrait relation when deserializing a Substrait Plan.
|
||||
using ConsumerFactory = std::function<std::shared_ptr<compute::SinkNodeConsumer>()>;
|
||||
|
||||
/// \brief Deserializes a Substrait Plan message to a list of ExecNode declarations
|
||||
///
|
||||
/// The output of each top-level Substrait relation will be sent to a caller supplied
|
||||
/// consumer function provided by consumer_factory
|
||||
///
|
||||
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
|
||||
/// message
|
||||
/// \param[in] consumer_factory factory function for generating the node that consumes
|
||||
/// the batches produced by each toplevel Substrait relation
|
||||
/// \param[in] registry an extension-id-registry to use, or null for the default one.
|
||||
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
|
||||
/// Plan is returned here.
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return a vector of ExecNode declarations, one for each toplevel relation in the
|
||||
/// Substrait Plan
|
||||
ARROW_ENGINE_EXPORT Result<std::vector<compute::Declaration>> DeserializePlans(
|
||||
const Buffer& buf, const ConsumerFactory& consumer_factory,
|
||||
const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Deserializes a single-relation Substrait Plan message to an execution plan
|
||||
///
|
||||
/// The output of each top-level Substrait relation will be sent to a caller supplied
|
||||
/// consumer, given by the `consumer` argument
|
||||
///
|
||||
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
|
||||
/// message
|
||||
/// \param[in] consumer node that consumes the batches produced by each toplevel Substrait
|
||||
/// relation
|
||||
/// \param[in] registry an extension-id-registry to use, or null for the default one.
|
||||
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
|
||||
/// Plan is returned here.
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return an ExecPlan corresponding to the single toplevel relation in the Substrait
|
||||
/// Plan
|
||||
ARROW_ENGINE_EXPORT Result<std::shared_ptr<compute::ExecPlan>> DeserializePlan(
|
||||
const Buffer& buf, const std::shared_ptr<compute::SinkNodeConsumer>& consumer,
|
||||
const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// Factory function type for generating the write options of a node consuming the batches
|
||||
/// produced by each toplevel Substrait relation when deserializing a Substrait Plan.
|
||||
using WriteOptionsFactory = std::function<std::shared_ptr<dataset::WriteNodeOptions>()>;
|
||||
|
||||
/// \brief Deserializes a Substrait Plan message to a list of ExecNode declarations
|
||||
///
|
||||
/// The output of each top-level Substrait relation will be written to a filesystem.
|
||||
/// `write_options_factory` can be used to control write behavior.
|
||||
///
|
||||
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
|
||||
/// message
|
||||
/// \param[in] write_options_factory factory function for generating the write options of
|
||||
/// a node consuming the batches produced by each toplevel Substrait relation
|
||||
/// \param[in] registry an extension-id-registry to use, or null for the default one.
|
||||
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
|
||||
/// Plan is returned here.
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return a vector of ExecNode declarations, one for each toplevel relation in the
|
||||
/// Substrait Plan
|
||||
ARROW_ENGINE_EXPORT Result<std::vector<compute::Declaration>> DeserializePlans(
|
||||
const Buffer& buf, const WriteOptionsFactory& write_options_factory,
|
||||
const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Deserializes a single-relation Substrait Plan message to an execution plan
|
||||
///
|
||||
/// The output of the single Substrait relation will be written to a filesystem.
|
||||
/// `write_options` can be used to control write behavior.
|
||||
///
|
||||
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
|
||||
/// message
|
||||
/// \param[in] write_options write options of a node consuming the batches produced by
|
||||
/// each toplevel Substrait relation
|
||||
/// \param[in] registry an extension-id-registry to use, or null for the default one.
|
||||
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
|
||||
/// Plan is returned here.
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return an ExecPlan corresponding to the single toplevel relation in the
|
||||
/// Substrait Plan
|
||||
ARROW_ENGINE_EXPORT Result<std::shared_ptr<compute::ExecPlan>> DeserializePlan(
|
||||
const Buffer& buf, const std::shared_ptr<dataset::WriteNodeOptions>& write_options,
|
||||
const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Deserializes a Substrait Plan message to a Declaration
|
||||
///
|
||||
/// The plan will not contain any sink nodes and will be suitable for use in any
|
||||
/// of the arrow::compute::DeclarationToXyz methods.
|
||||
///
|
||||
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
|
||||
/// message
|
||||
/// \param[in] registry an extension-id-registry to use, or null for the default one.
|
||||
/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
|
||||
/// Plan is returned here.
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return A declaration representing the Substrait plan
|
||||
ARROW_ENGINE_EXPORT Result<compute::Declaration> DeserializePlan(
|
||||
const Buffer& buf, const ExtensionIdRegistry* registry = NULLPTR,
|
||||
ExtensionSet* ext_set_out = NULLPTR,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Deserializes a Substrait Type message to the corresponding Arrow type
|
||||
///
|
||||
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Type
|
||||
/// message
|
||||
/// \param[in] ext_set the extension mapping to use, normally provided by the
|
||||
/// surrounding Plan message
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return the corresponding Arrow data type
|
||||
ARROW_ENGINE_EXPORT
|
||||
Result<std::shared_ptr<DataType>> DeserializeType(
|
||||
const Buffer& buf, const ExtensionSet& ext_set,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Serializes an Arrow type to a Substrait Type message
|
||||
///
|
||||
/// \param[in] type the Arrow data type to serialize
|
||||
/// \param[in,out] ext_set the extension mapping to use; may be updated to add a
|
||||
/// mapping for the given type
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return a buffer containing the protobuf serialization of the corresponding Substrait
|
||||
/// Type message
|
||||
ARROW_ENGINE_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SerializeType(
|
||||
const DataType& type, ExtensionSet* ext_set,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Deserializes a Substrait NamedStruct message to an Arrow schema
|
||||
///
|
||||
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait
|
||||
/// NamedStruct message
|
||||
/// \param[in] ext_set the extension mapping to use, normally provided by the
|
||||
/// surrounding Plan message
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return the corresponding Arrow schema
|
||||
ARROW_ENGINE_EXPORT
|
||||
Result<std::shared_ptr<Schema>> DeserializeSchema(
|
||||
const Buffer& buf, const ExtensionSet& ext_set,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Serializes an Arrow schema to a Substrait NamedStruct message
|
||||
///
|
||||
/// \param[in] schema the Arrow schema to serialize
|
||||
/// \param[in,out] ext_set the extension mapping to use; may be updated to add
|
||||
/// mappings for the types used in the schema
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return a buffer containing the protobuf serialization of the corresponding Substrait
|
||||
/// NamedStruct message
|
||||
ARROW_ENGINE_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SerializeSchema(
|
||||
const Schema& schema, ExtensionSet* ext_set,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Deserializes a Substrait Expression message to a compute expression
|
||||
///
|
||||
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait
|
||||
/// Expression message
|
||||
/// \param[in] ext_set the extension mapping to use, normally provided by the
|
||||
/// surrounding Plan message
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return the corresponding Arrow compute expression
|
||||
ARROW_ENGINE_EXPORT
|
||||
Result<compute::Expression> DeserializeExpression(
|
||||
const Buffer& buf, const ExtensionSet& ext_set,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Serializes an Arrow compute expression to a Substrait Expression message
|
||||
///
|
||||
/// \param[in] expr the Arrow compute expression to serialize
|
||||
/// \param[in,out] ext_set the extension mapping to use; may be updated to add
|
||||
/// mappings for the types used in the expression
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return a buffer containing the protobuf serialization of the corresponding Substrait
|
||||
/// Expression message
|
||||
ARROW_ENGINE_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SerializeExpression(
|
||||
const compute::Expression& expr, ExtensionSet* ext_set,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Serialize an Acero Declaration to a binary protobuf Substrait message
|
||||
///
|
||||
/// \param[in] declaration the Acero declaration to serialize
|
||||
/// \param[in,out] ext_set the extension mapping to use; may be updated with new mappings
|
||||
/// \param[in] conversion_options options to control how the conversion is done
|
||||
///
|
||||
/// \return a buffer containing the protobuf serialization of the Acero relation
|
||||
ARROW_ENGINE_EXPORT Result<std::shared_ptr<Buffer>> SerializeRelation(
|
||||
const compute::Declaration& declaration, ExtensionSet* ext_set,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
/// \brief Deserializes a Substrait Rel (relation) message to an ExecNode declaration
|
||||
///
|
||||
/// \param[in] buf a buffer containing the protobuf serialization of a Substrait
|
||||
/// Rel message
|
||||
/// \param[in] ext_set the extension mapping to use, normally provided by the
|
||||
/// surrounding Plan message
|
||||
/// \param[in] conversion_options options to control how the conversion is to be done.
|
||||
/// \return the corresponding ExecNode declaration
|
||||
ARROW_ENGINE_EXPORT Result<compute::Declaration> DeserializeRelation(
|
||||
const Buffer& buf, const ExtensionSet& ext_set,
|
||||
const ConversionOptions& conversion_options = {});
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Checks whether two protobuf serializations of a particular Substrait message
|
||||
/// type are equivalent
|
||||
///
|
||||
/// Note that a binary comparison of the two buffers is insufficient. One reason for this
|
||||
/// is that the fields of a message can be specified in any order in the serialization.
|
||||
///
|
||||
/// \param[in] message_name the name of the Substrait message type to check
|
||||
/// \param[in] l_buf buffer containing the first protobuf serialization to compare
|
||||
/// \param[in] r_buf buffer containing the second protobuf serialization to compare
|
||||
/// \return success if equivalent, failure if not
|
||||
ARROW_ENGINE_EXPORT
|
||||
Status CheckMessagesEquivalent(std::string_view message_name, const Buffer& l_buf,
|
||||
const Buffer& r_buf);
|
||||
|
||||
/// \brief Utility function to convert a JSON serialization of a Substrait message to
|
||||
/// its binary serialization
|
||||
///
|
||||
/// \param[in] type_name the name of the Substrait message type to convert
|
||||
/// \param[in] json the JSON string to convert
|
||||
/// \param[in] ignore_unknown_fields if true then unknown fields will be ignored and
|
||||
/// will not cause an error
|
||||
///
|
||||
/// This should generally be true to allow consumption of plans from newer
|
||||
/// producers but setting to false can be useful if you are testing
|
||||
/// conformance to a specific Substrait version
|
||||
/// \return a buffer filled with the binary protobuf serialization of message
|
||||
ARROW_ENGINE_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SubstraitFromJSON(std::string_view type_name,
|
||||
std::string_view json,
|
||||
bool ignore_unknown_fields = true);
|
||||
|
||||
/// \brief Utility function to convert a binary protobuf serialization of a Substrait
|
||||
/// message to JSON
|
||||
///
|
||||
/// \param[in] type_name the name of the Substrait message type to convert
|
||||
/// \param[in] buf the buffer containing the binary protobuf serialization of the message
|
||||
/// \return a JSON string representing the message
|
||||
ARROW_ENGINE_EXPORT
|
||||
Result<std::string> SubstraitToJSON(std::string_view type_name, const Buffer& buf);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace engine
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,75 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// These utilities are for internal / unit test use only.
|
||||
// They allow for the construction of simple Substrait plans
|
||||
// programmatically without first requiring the construction
|
||||
// of an ExecPlan
|
||||
|
||||
// These utilities have to be here, and not in a test_util.cc
|
||||
// file (or in a unit test) because only one .so is allowed
|
||||
// to include each .pb.h file or else protobuf will encounter
|
||||
// global namespace conflicts.
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/engine/substrait/visibility.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace engine {
|
||||
|
||||
struct Id;
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Create a scan->project->sink plan for tests
|
||||
///
|
||||
/// The plan will project one additional column using the function
|
||||
/// defined by `function_id`, `arguments`, and `data_types`. `arguments`
|
||||
/// and `data_types` should have the same length but only one of each
|
||||
/// should be defined at each index.
|
||||
///
|
||||
/// If `data_types` is defined at an index then the plan will create a
|
||||
/// direct reference (starting at index 0 and increasing by 1 for each
|
||||
/// argument of this type).
|
||||
///
|
||||
/// If `arguments` is defined at an index then the plan will create an
|
||||
/// enum argument with that value.
|
||||
ARROW_ENGINE_EXPORT Result<std::shared_ptr<Buffer>> CreateScanProjectSubstrait(
|
||||
Id function_id, const std::shared_ptr<Table>& input_table,
|
||||
const std::vector<std::string>& arguments,
|
||||
const std::unordered_map<std::string, std::vector<std::string>>& options,
|
||||
const std::vector<std::shared_ptr<DataType>>& data_types,
|
||||
const DataType& output_type);
|
||||
|
||||
/// \brief Create a scan->aggregate->sink plan for tests
|
||||
///
|
||||
/// The plan will create an aggregate with one grouping set (defined by
|
||||
/// key_idxs) and one measure. The measure will be a unary function
|
||||
/// defined by `function_id` and a direct reference to `arg_idx`.
|
||||
ARROW_ENGINE_EXPORT Result<std::shared_ptr<Buffer>> CreateScanAggSubstrait(
|
||||
Id function_id, const std::shared_ptr<Table>& input_table,
|
||||
const std::vector<int>& key_idxs, int arg_idx, const DataType& output_type);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace engine
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,32 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace arrow {
|
||||
namespace engine {
|
||||
|
||||
class ExtensionIdRegistry;
|
||||
class ExtensionSet;
|
||||
|
||||
struct ConversionOptions;
|
||||
struct DeclarationInfo;
|
||||
|
||||
} // namespace engine
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,73 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/compute/type_fwd.h"
|
||||
#include "arrow/engine/substrait/options.h"
|
||||
#include "arrow/engine/substrait/type_fwd.h"
|
||||
#include "arrow/engine/substrait/visibility.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/iterator.h"
|
||||
#include "arrow/util/macros.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
namespace engine {
|
||||
|
||||
using PythonTableProvider =
|
||||
std::function<Result<std::shared_ptr<Table>>(const std::vector<std::string>&)>;
|
||||
|
||||
/// \brief Utility method to run a Substrait plan
|
||||
/// \param substrait_buffer The plan to run, must be in binary protobuf format
|
||||
/// \param registry A registry of extension functions to make available to the plan
|
||||
/// If null then the default registry will be used.
|
||||
/// \param memory_pool The memory pool the plan should use to make allocations.
|
||||
/// \param func_registry A registry of functions used for execution expressions.
|
||||
/// `registry` maps from Substrait function IDs to "names". These
|
||||
/// names will be provided to `func_registry` to get the actual
|
||||
/// kernel.
|
||||
/// \param conversion_options Options to control plan deserialization
|
||||
/// \param use_threads If True then the CPU thread pool will be used for CPU work. If
|
||||
/// False then all work will be done on the calling thread.
|
||||
/// \return A record batch reader that will read out the results
|
||||
ARROW_ENGINE_EXPORT Result<std::shared_ptr<RecordBatchReader>> ExecuteSerializedPlan(
|
||||
const Buffer& substrait_buffer, const ExtensionIdRegistry* registry = NULLPTR,
|
||||
compute::FunctionRegistry* func_registry = NULLPTR,
|
||||
const ConversionOptions& conversion_options = {}, bool use_threads = true,
|
||||
MemoryPool* memory_pool = default_memory_pool());
|
||||
|
||||
/// \brief Get a Serialized Plan from a Substrait JSON plan.
|
||||
/// This is a helper method for Python tests.
|
||||
ARROW_ENGINE_EXPORT Result<std::shared_ptr<Buffer>> SerializeJsonPlan(
|
||||
const std::string& substrait_json);
|
||||
|
||||
/// \brief Make a nested registry with the default registry as parent.
|
||||
/// See arrow::engine::nested_extension_id_registry for details.
|
||||
ARROW_ENGINE_EXPORT std::shared_ptr<ExtensionIdRegistry> MakeExtensionIdRegistry();
|
||||
|
||||
ARROW_ENGINE_EXPORT const std::string& default_extension_types_uri();
|
||||
|
||||
} // namespace engine
|
||||
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,52 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// TODO(westonpace): Once we have a proper engine module this file
|
||||
// should be renamed arrow/engine/visibility.h
|
||||
// This API is EXPERIMENTAL.
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(_WIN32) || defined(__CYGWIN__)
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(push)
|
||||
#pragma warning(disable : 4251)
|
||||
#else
|
||||
#pragma GCC diagnostic ignored "-Wattributes"
|
||||
#endif
|
||||
|
||||
#ifdef ARROW_ENGINE_STATIC
|
||||
#define ARROW_ENGINE_EXPORT
|
||||
#elif defined(ARROW_ENGINE_EXPORTING)
|
||||
#define ARROW_ENGINE_EXPORT __declspec(dllexport)
|
||||
#else
|
||||
#define ARROW_ENGINE_EXPORT __declspec(dllimport)
|
||||
#endif
|
||||
|
||||
#define ARROW_ENGINE_NO_EXPORT
|
||||
#else // Not Windows
|
||||
#ifndef ARROW_ENGINE_EXPORT
|
||||
#define ARROW_ENGINE_EXPORT __attribute__((visibility("default")))
|
||||
#endif
|
||||
#ifndef ARROW_ENGINE_NO_EXPORT
|
||||
#define ARROW_ENGINE_NO_EXPORT __attribute__((visibility("hidden")))
|
||||
#endif
|
||||
#endif // Non-Windows
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(pop)
|
||||
#endif
|
||||
Reference in New Issue
Block a user