Merging PR_218 openai_rev package with new streamlit chat app
This commit is contained in:
@@ -0,0 +1,25 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arrow/ipc/dictionary.h"
|
||||
#include "arrow/ipc/feather.h"
|
||||
#include "arrow/ipc/json_simple.h"
|
||||
#include "arrow/ipc/message.h"
|
||||
#include "arrow/ipc/reader.h"
|
||||
#include "arrow/ipc/writer.h"
|
||||
@@ -0,0 +1,177 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Tools for dictionaries in IPC context
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace ipc {
|
||||
|
||||
namespace internal {
|
||||
|
||||
/// \brief Position of a field inside a nested schema, kept as a chain of
/// (parent position, child index) nodes.
///
/// A default-constructed instance is the root: it has no parent, index -1 and
/// depth 0. Every child stores a raw, non-owning pointer to the position it was
/// created from, so that position must outlive the child.
class FieldPosition {
 public:
  /// \brief Construct the root position.
  FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {}

  /// \brief Return the position of child number `index` below this position.
  FieldPosition child(int index) const { return FieldPosition(this, index); }

  /// \brief Return the child indices leading from the root to this position.
  ///
  /// The vector has one entry per level (`depth_` entries in total), outermost
  /// index first. It is empty for the root.
  std::vector<int> path() const {
    std::vector<int> indices(depth_);
    const FieldPosition* node = this;
    // Climb the parent chain and fill the vector back to front, so that the
    // outermost index ends up in slot 0.
    int slot = depth_;
    while (slot > 0) {
      --slot;
      indices[slot] = node->index_;
      node = node->parent_;
    }
    return indices;
  }

 protected:
  /// \brief Construct a position one level below `parent`.
  ///
  /// `parent` is dereferenced here and therefore must not be null.
  FieldPosition(const FieldPosition* parent, int index)
      : parent_(parent), index_(index), depth_(parent->depth_ + 1) {}

  const FieldPosition* parent_;  // non-owning; null only for the root
  int index_;                    // index within the parent; -1 for the root
  int depth_;                    // number of ancestors; 0 for the root
};
|
||||
|
||||
} // namespace internal
|
||||
|
||||
/// \brief Map fields in a schema to dictionary ids
|
||||
///
|
||||
/// The mapping is structural, i.e. the field path (as a vector of indices)
|
||||
/// is associated to the dictionary id. A dictionary id may be associated
|
||||
/// to multiple fields.
|
||||
class ARROW_EXPORT DictionaryFieldMapper {
|
||||
public:
|
||||
DictionaryFieldMapper();
|
||||
explicit DictionaryFieldMapper(const Schema& schema);
|
||||
~DictionaryFieldMapper();
|
||||
|
||||
Status AddSchemaFields(const Schema& schema);
|
||||
Status AddField(int64_t id, std::vector<int> field_path);
|
||||
|
||||
Result<int64_t> GetFieldId(std::vector<int> field_path) const;
|
||||
|
||||
int num_fields() const;
|
||||
|
||||
/// \brief Returns number of unique dictionaries, taking into
|
||||
/// account that different fields can share the same dictionary.
|
||||
int num_dicts() const;
|
||||
|
||||
private:
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
/// \brief Sequence of (int64_t, array) pairs. In this header the int64_t is
/// used as a dictionary id and the array as the dictionary stored under it
/// (see CollectDictionaries).
using DictionaryVector = std::vector<std::pair<int64_t, std::shared_ptr<Array>>>;
|
||||
|
||||
/// \brief Memoization data structure for reading dictionaries from IPC streams
|
||||
///
|
||||
/// This structure tracks the following associations:
|
||||
/// - field position (structural) -> dictionary id
|
||||
/// - dictionary id -> value type
|
||||
/// - dictionary id -> dictionary (value) data
|
||||
///
|
||||
/// Together, they allow resolving dictionary data when reading an IPC stream,
|
||||
/// using metadata recorded in the schema message and data recorded in the
|
||||
/// dictionary batch messages (see ResolveDictionaries).
|
||||
///
|
||||
/// This structure isn't useful for writing an IPC stream, where only
|
||||
/// DictionaryFieldMapper is necessary.
|
||||
class ARROW_EXPORT DictionaryMemo {
|
||||
public:
|
||||
DictionaryMemo();
|
||||
~DictionaryMemo();
|
||||
|
||||
DictionaryFieldMapper& fields();
|
||||
const DictionaryFieldMapper& fields() const;
|
||||
|
||||
/// \brief Return current dictionary corresponding to a particular
|
||||
/// id. Returns KeyError if id not found
|
||||
Result<std::shared_ptr<ArrayData>> GetDictionary(int64_t id, MemoryPool* pool) const;
|
||||
|
||||
/// \brief Return dictionary value type corresponding to a
|
||||
/// particular dictionary id.
|
||||
Result<std::shared_ptr<DataType>> GetDictionaryType(int64_t id) const;
|
||||
|
||||
/// \brief Return true if we have a dictionary for the input id
|
||||
bool HasDictionary(int64_t id) const;
|
||||
|
||||
/// \brief Add a dictionary value type to the memo with a particular id.
|
||||
/// Returns KeyError if a different type is already registered with the same id.
|
||||
Status AddDictionaryType(int64_t id, const std::shared_ptr<DataType>& type);
|
||||
|
||||
/// \brief Add a dictionary to the memo with a particular id. Returns
|
||||
/// KeyError if that dictionary already exists
|
||||
Status AddDictionary(int64_t id, const std::shared_ptr<ArrayData>& dictionary);
|
||||
|
||||
/// \brief Append a dictionary delta to the memo with a particular id. Returns
|
||||
/// KeyError if that dictionary does not exists
|
||||
Status AddDictionaryDelta(int64_t id, const std::shared_ptr<ArrayData>& dictionary);
|
||||
|
||||
/// \brief Add a dictionary to the memo if it does not have one with the id,
|
||||
/// otherwise, replace the dictionary with the new one.
|
||||
///
|
||||
/// Return true if the dictionary was added, false if replaced.
|
||||
Result<bool> AddOrReplaceDictionary(int64_t id,
|
||||
const std::shared_ptr<ArrayData>& dictionary);
|
||||
|
||||
private:
|
||||
struct Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
};
|
||||
|
||||
// For writing: collect dictionary entries to write to the IPC stream, in order
|
||||
// (i.e. inner dictionaries before dependent outer dictionaries).
|
||||
ARROW_EXPORT
|
||||
Result<DictionaryVector> CollectDictionaries(const RecordBatch& batch,
|
||||
const DictionaryFieldMapper& mapper);
|
||||
|
||||
// For reading: resolve all dictionaries in columns, according to the field
|
||||
// mapping and dictionary arrays stored in memo.
|
||||
// Columns may be sparse, i.e. some entries may be left null
|
||||
// (e.g. if an inclusion mask was used).
|
||||
ARROW_EXPORT
|
||||
Status ResolveDictionaries(const ArrayDataVector& columns, const DictionaryMemo& memo,
|
||||
MemoryPool* pool);
|
||||
|
||||
namespace internal {
|
||||
|
||||
// Like CollectDictionaries above, but uses the memo's DictionaryFieldMapper
|
||||
// and all collected dictionaries are added to the memo using AddDictionary.
|
||||
//
|
||||
// This is used as a shortcut in some roundtripping tests (to avoid emitting
|
||||
// any actual dictionary batches).
|
||||
ARROW_EXPORT
|
||||
Status CollectDictionaries(const RecordBatch& batch, DictionaryMemo* memo);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,150 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Public API for the "Feather" file format, originally created at
|
||||
// http://github.com/wesm/feather
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/ipc/options.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/compression.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Schema;
|
||||
class Status;
|
||||
class Table;
|
||||
|
||||
namespace io {
|
||||
|
||||
class OutputStream;
|
||||
class RandomAccessFile;
|
||||
|
||||
} // namespace io
|
||||
|
||||
namespace ipc {
|
||||
namespace feather {
|
||||
|
||||
/// Version number identifying the "Feather V1" format.
static constexpr int kFeatherV1Version = 2;
/// Version number identifying the "Feather V2" format.
static constexpr int kFeatherV2Version = 3;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Metadata accessor classes
|
||||
|
||||
/// \class Reader
|
||||
/// \brief An interface for reading columns from Feather files
|
||||
class ARROW_EXPORT Reader {
|
||||
public:
|
||||
virtual ~Reader() = default;
|
||||
|
||||
/// \brief Open a Feather file from a RandomAccessFile interface
|
||||
///
|
||||
/// \param[in] source a RandomAccessFile instance
|
||||
/// \return the table reader
|
||||
static Result<std::shared_ptr<Reader>> Open(
|
||||
const std::shared_ptr<io::RandomAccessFile>& source);
|
||||
|
||||
/// \brief Open a Feather file from a RandomAccessFile interface
|
||||
/// with IPC Read options
|
||||
///
|
||||
/// \param[in] source a RandomAccessFile instance
|
||||
/// \param[in] options IPC Read options
|
||||
/// \return the table reader
|
||||
static Result<std::shared_ptr<Reader>> Open(
|
||||
const std::shared_ptr<io::RandomAccessFile>& source, const IpcReadOptions& options);
|
||||
|
||||
/// \brief Return the version number of the Feather file
|
||||
virtual int version() const = 0;
|
||||
|
||||
virtual std::shared_ptr<Schema> schema() const = 0;
|
||||
|
||||
/// \brief Read all columns from the file as an arrow::Table.
|
||||
///
|
||||
/// \param[out] out the returned table
|
||||
/// \return Status
|
||||
///
|
||||
/// This function is zero-copy if the file source supports zero-copy reads
|
||||
virtual Status Read(std::shared_ptr<Table>* out) = 0;
|
||||
|
||||
/// \brief Read only the specified columns from the file as an arrow::Table.
|
||||
///
|
||||
/// \param[in] indices the column indices to read
|
||||
/// \param[out] out the returned table
|
||||
/// \return Status
|
||||
///
|
||||
/// This function is zero-copy if the file source supports zero-copy reads
|
||||
virtual Status Read(const std::vector<int>& indices, std::shared_ptr<Table>* out) = 0;
|
||||
|
||||
/// \brief Read only the specified columns from the file as an arrow::Table.
|
||||
///
|
||||
/// \param[in] names the column names to read
|
||||
/// \param[out] out the returned table
|
||||
/// \return Status
|
||||
///
|
||||
/// This function is zero-copy if the file source supports zero-copy reads
|
||||
virtual Status Read(const std::vector<std::string>& names,
|
||||
std::shared_ptr<Table>* out) = 0;
|
||||
};
|
||||
|
||||
struct ARROW_EXPORT WriteProperties {
|
||||
static WriteProperties Defaults();
|
||||
|
||||
static WriteProperties DefaultsV1() {
|
||||
WriteProperties props = Defaults();
|
||||
props.version = kFeatherV1Version;
|
||||
return props;
|
||||
}
|
||||
|
||||
/// Feather file version number
|
||||
///
|
||||
/// version 2: "Feather V1" Apache Arrow <= 0.16.0
|
||||
/// version 3: "Feather V2" Apache Arrow > 0.16.0
|
||||
int version = kFeatherV2Version;
|
||||
|
||||
// Parameters for Feather V2 only
|
||||
|
||||
/// Number of rows per intra-file chunk. Use smaller chunksize when you need
|
||||
/// faster random row access
|
||||
int64_t chunksize = 1LL << 16;
|
||||
|
||||
/// Compression type to use. Only UNCOMPRESSED, LZ4_FRAME, and ZSTD are
|
||||
/// supported. The default compression returned by Defaults() is LZ4 if the
|
||||
/// project is built with support for it, otherwise
|
||||
/// UNCOMPRESSED. UNCOMPRESSED is set as the object default here so that if
|
||||
/// WriteProperties::Defaults() is not used, the default constructor for
|
||||
/// WriteProperties will work regardless of the options used to build the C++
|
||||
/// project.
|
||||
Compression::type compression = Compression::UNCOMPRESSED;
|
||||
|
||||
/// Compressor-specific compression level
|
||||
int compression_level = ::arrow::util::kUseDefaultCompressionLevel;
|
||||
};
|
||||
|
||||
/// \brief Write `table` to the output stream `dst`
///
/// \param[in] table the table to write
/// \param[in] dst the output stream to write to
/// \param[in] properties the write properties; WriteProperties::Defaults()
/// when omitted
/// \return Status
ARROW_EXPORT Status WriteTable(
    const Table& table, io::OutputStream* dst,
    const WriteProperties& properties = WriteProperties::Defaults());
|
||||
|
||||
} // namespace feather
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,71 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Implement a simple JSON representation format for arrays
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class DataType;
|
||||
|
||||
namespace ipc {
|
||||
namespace internal {
|
||||
namespace json {
|
||||
|
||||
// Build an Array from a JSON string. The three overloads differ only in how
// the JSON text is passed in.
// NOTE(review): the unnamed first argument is presumably the type of the
// array to build -- confirm against the implementation.
ARROW_EXPORT Result<std::shared_ptr<Array>> ArrayFromJSON(
    const std::shared_ptr<DataType>&, const std::string& json);

ARROW_EXPORT Result<std::shared_ptr<Array>> ArrayFromJSON(
    const std::shared_ptr<DataType>&, std::string_view json);

ARROW_EXPORT Result<std::shared_ptr<Array>> ArrayFromJSON(
    const std::shared_ptr<DataType>&, const char* json);
|
||||
|
||||
// Build a ChunkedArray of the given type from a list of JSON strings and store
// it in `*out`.
// NOTE(review): presumably one chunk per entry of `json_strings` -- confirm
// against the implementation.
ARROW_EXPORT Status ChunkedArrayFromJSON(const std::shared_ptr<DataType>& type,
                                         const std::vector<std::string>& json_strings,
                                         std::shared_ptr<ChunkedArray>* out);
|
||||
|
||||
// Build a dictionary array from two JSON strings, one for the indices and one
// for the dictionary values, and store it in `*out`.
// NOTE(review): the unnamed first argument is presumably the dictionary type
// of the result -- confirm against the implementation.
ARROW_EXPORT Status DictArrayFromJSON(const std::shared_ptr<DataType>&,
                                      std::string_view indices_json,
                                      std::string_view dictionary_json,
                                      std::shared_ptr<Array>* out);
|
||||
|
||||
// Build a Scalar from a JSON string and store it in `*out`.
// NOTE(review): the unnamed first argument is presumably the type of the
// scalar to build -- confirm against the implementation.
ARROW_EXPORT Status ScalarFromJSON(const std::shared_ptr<DataType>&,
                                   std::string_view json, std::shared_ptr<Scalar>* out);
|
||||
|
||||
// Build a dictionary scalar from two JSON strings, one for the index and one
// for the dictionary values, and store it in `*out`.
// NOTE(review): the unnamed first argument is presumably the dictionary type
// of the result -- confirm against the implementation.
ARROW_EXPORT Status DictScalarFromJSON(const std::shared_ptr<DataType>&,
                                       std::string_view index_json,
                                       std::string_view dictionary_json,
                                       std::shared_ptr<Scalar>* out);
|
||||
|
||||
} // namespace json
|
||||
} // namespace internal
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,565 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// C++ object model and user API for interprocess schema messaging
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/io/type_fwd.h"
|
||||
#include "arrow/ipc/type_fwd.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace ipc {
|
||||
|
||||
struct IpcWriteOptions;
|
||||
|
||||
// Read interface classes. We do not fully deserialize the flatbuffers so that
// individual fields metadata can be retrieved from very large schema without
// deserializing the whole message.
|
||||
|
||||
/// \class Message
|
||||
/// \brief An IPC message including metadata and body
|
||||
class ARROW_EXPORT Message {
|
||||
public:
|
||||
/// \brief Construct message, but do not validate
|
||||
///
|
||||
/// Use at your own risk; Message::Open has more metadata validation
|
||||
Message(std::shared_ptr<Buffer> metadata, std::shared_ptr<Buffer> body);
|
||||
|
||||
~Message();
|
||||
|
||||
/// \brief Create and validate a Message instance from two buffers
|
||||
///
|
||||
/// \param[in] metadata a buffer containing the Flatbuffer metadata
|
||||
/// \param[in] body a buffer containing the message body, which may be null
|
||||
/// \return the created message
|
||||
static Result<std::unique_ptr<Message>> Open(std::shared_ptr<Buffer> metadata,
|
||||
std::shared_ptr<Buffer> body);
|
||||
|
||||
/// \brief Read message body and create Message given Flatbuffer metadata
|
||||
/// \param[in] metadata containing a serialized Message flatbuffer
|
||||
/// \param[in] stream an InputStream
|
||||
/// \return the created Message
|
||||
///
|
||||
/// \note If stream supports zero-copy, this is zero-copy
|
||||
static Result<std::unique_ptr<Message>> ReadFrom(std::shared_ptr<Buffer> metadata,
|
||||
io::InputStream* stream);
|
||||
|
||||
/// \brief Read message body from position in file, and create Message given
|
||||
/// the Flatbuffer metadata
|
||||
/// \param[in] offset the position in the file where the message body starts.
|
||||
/// \param[in] metadata containing a serialized Message flatbuffer
|
||||
/// \param[in] file the seekable file interface to read from
|
||||
/// \return the created Message
|
||||
///
|
||||
/// \note If file supports zero-copy, this is zero-copy
|
||||
static Result<std::unique_ptr<Message>> ReadFrom(const int64_t offset,
|
||||
std::shared_ptr<Buffer> metadata,
|
||||
io::RandomAccessFile* file);
|
||||
|
||||
/// \brief Return true if message type and contents are equal
|
||||
///
|
||||
/// \param other another message
|
||||
/// \return true if contents equal
|
||||
bool Equals(const Message& other) const;
|
||||
|
||||
/// \brief the Message metadata
|
||||
///
|
||||
/// \return buffer
|
||||
std::shared_ptr<Buffer> metadata() const;
|
||||
|
||||
/// \brief Custom metadata serialized in metadata Flatbuffer. Returns nullptr
|
||||
/// when none set
|
||||
const std::shared_ptr<const KeyValueMetadata>& custom_metadata() const;
|
||||
|
||||
/// \brief the Message body, if any
|
||||
///
|
||||
/// \return buffer is null if no body
|
||||
std::shared_ptr<Buffer> body() const;
|
||||
|
||||
/// \brief The expected body length according to the metadata, for
|
||||
/// verification purposes
|
||||
int64_t body_length() const;
|
||||
|
||||
/// \brief The Message type
|
||||
MessageType type() const;
|
||||
|
||||
/// \brief The Message metadata version
|
||||
MetadataVersion metadata_version() const;
|
||||
|
||||
const void* header() const;
|
||||
|
||||
/// \brief Write length-prefixed metadata and body to output stream
|
||||
///
|
||||
/// \param[in] file output stream to write to
|
||||
/// \param[in] options IPC writing options including alignment
|
||||
/// \param[out] output_length the number of bytes written
|
||||
/// \return Status
|
||||
Status SerializeTo(io::OutputStream* file, const IpcWriteOptions& options,
|
||||
int64_t* output_length) const;
|
||||
|
||||
/// \brief Return true if the Message metadata passes Flatbuffer validation
|
||||
bool Verify() const;
|
||||
|
||||
/// \brief Whether a given message type needs a body.
|
||||
static bool HasBody(MessageType type) {
|
||||
return type != MessageType::NONE && type != MessageType::SCHEMA;
|
||||
}
|
||||
|
||||
private:
|
||||
// Hide serialization details from user API
|
||||
class MessageImpl;
|
||||
std::unique_ptr<MessageImpl> impl_;
|
||||
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(Message);
|
||||
};
|
||||
|
||||
/// \brief Return a string for the given message type.
///
/// NOTE(review): the exact text per type is set by the implementation, which
/// is not visible in this header.
ARROW_EXPORT std::string FormatMessageType(MessageType type);
|
||||
|
||||
/// \class MessageDecoderListener
|
||||
/// \brief An abstract class to listen events from MessageDecoder.
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
///
|
||||
/// \since 0.17.0
|
||||
class ARROW_EXPORT MessageDecoderListener {
|
||||
public:
|
||||
virtual ~MessageDecoderListener() = default;
|
||||
|
||||
/// \brief Called when a message is decoded.
|
||||
///
|
||||
/// MessageDecoder calls this method when it decodes a message. This
|
||||
/// method is called multiple times when the target stream has
|
||||
/// multiple messages.
|
||||
///
|
||||
/// \param[in] message a decoded message
|
||||
/// \return Status
|
||||
virtual Status OnMessageDecoded(std::unique_ptr<Message> message) = 0;
|
||||
|
||||
/// \brief Called when the decoder state is changed to
|
||||
/// MessageDecoder::State::INITIAL.
|
||||
///
|
||||
/// The default implementation just returns arrow::Status::OK().
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status OnInitial();
|
||||
|
||||
/// \brief Called when the decoder state is changed to
|
||||
/// MessageDecoder::State::METADATA_LENGTH.
|
||||
///
|
||||
/// The default implementation just returns arrow::Status::OK().
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status OnMetadataLength();
|
||||
|
||||
/// \brief Called when the decoder state is changed to
|
||||
/// MessageDecoder::State::METADATA.
|
||||
///
|
||||
/// The default implementation just returns arrow::Status::OK().
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status OnMetadata();
|
||||
|
||||
/// \brief Called when the decoder state is changed to
|
||||
/// MessageDecoder::State::BODY.
|
||||
///
|
||||
/// The default implementation just returns arrow::Status::OK().
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status OnBody();
|
||||
|
||||
/// \brief Called when the decoder state is changed to
|
||||
/// MessageDecoder::State::EOS.
|
||||
///
|
||||
/// The default implementation just returns arrow::Status::OK().
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status OnEOS();
|
||||
};
|
||||
|
||||
/// \class AssignMessageDecoderListener
|
||||
/// \brief Assign a message decoded by MessageDecoder.
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
///
|
||||
/// \since 0.17.0
|
||||
class ARROW_EXPORT AssignMessageDecoderListener : public MessageDecoderListener {
|
||||
public:
|
||||
/// \brief Construct a listener that assigns a decoded message to the
|
||||
/// specified location.
|
||||
///
|
||||
/// \param[in] message a location to store the received message
|
||||
explicit AssignMessageDecoderListener(std::unique_ptr<Message>* message)
|
||||
: message_(message) {}
|
||||
|
||||
virtual ~AssignMessageDecoderListener() = default;
|
||||
|
||||
Status OnMessageDecoded(std::unique_ptr<Message> message) override {
|
||||
*message_ = std::move(message);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
std::unique_ptr<Message>* message_;
|
||||
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(AssignMessageDecoderListener);
|
||||
};
|
||||
|
||||
/// \class MessageDecoder
|
||||
/// \brief Push style message decoder that receives data from user.
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
///
|
||||
/// \since 0.17.0
|
||||
class ARROW_EXPORT MessageDecoder {
|
||||
public:
|
||||
/// \brief State for reading a message
|
||||
enum State {
|
||||
/// The initial state. It requires one of the followings as the next data:
|
||||
///
|
||||
/// * int32_t continuation token
|
||||
/// * int32_t end-of-stream mark (== 0)
|
||||
/// * int32_t metadata length (backward compatibility for
|
||||
/// reading old IPC messages produced prior to version 0.15.0
|
||||
INITIAL,
|
||||
|
||||
/// It requires int32_t metadata length.
|
||||
METADATA_LENGTH,
|
||||
|
||||
/// It requires metadata.
|
||||
METADATA,
|
||||
|
||||
/// It requires message body.
|
||||
BODY,
|
||||
|
||||
/// The end-of-stream state. No more data is processed.
|
||||
EOS,
|
||||
};
|
||||
|
||||
/// \brief Construct a message decoder.
|
||||
///
|
||||
/// \param[in] listener a MessageDecoderListener that responds events from
|
||||
/// the decoder
|
||||
/// \param[in] pool an optional MemoryPool to copy metadata on the
|
||||
/// \param[in] skip_body if true the body will be skipped even if the message has a body
|
||||
/// CPU, if required
|
||||
explicit MessageDecoder(std::shared_ptr<MessageDecoderListener> listener,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
bool skip_body = false);
|
||||
|
||||
/// \brief Construct a message decoder with the specified state.
|
||||
///
|
||||
/// This is a construct for advanced users that know how to decode
|
||||
/// Message.
|
||||
///
|
||||
/// \param[in] listener a MessageDecoderListener that responds events from
|
||||
/// the decoder
|
||||
/// \param[in] initial_state an initial state of the decode
|
||||
/// \param[in] initial_next_required_size the number of bytes needed
|
||||
/// to run the next action
|
||||
/// \param[in] pool an optional MemoryPool to copy metadata on the
|
||||
/// CPU, if required
|
||||
/// \param[in] skip_body if true the body will be skipped even if the message has a body
|
||||
MessageDecoder(std::shared_ptr<MessageDecoderListener> listener, State initial_state,
|
||||
int64_t initial_next_required_size,
|
||||
MemoryPool* pool = default_memory_pool(), bool skip_body = false);
|
||||
|
||||
virtual ~MessageDecoder();
|
||||
|
||||
/// \brief Feed data to the decoder as a raw data.
|
||||
///
|
||||
/// If the decoder can decode one or more messages by the data, the
|
||||
/// decoder calls listener->OnMessageDecoded() with a decoded
|
||||
/// message multiple times.
|
||||
///
|
||||
/// If the state of the decoder is changed, corresponding callbacks
|
||||
/// on listener is called:
|
||||
///
|
||||
/// * MessageDecoder::State::INITIAL: listener->OnInitial()
|
||||
/// * MessageDecoder::State::METADATA_LENGTH: listener->OnMetadataLength()
|
||||
/// * MessageDecoder::State::METADATA: listener->OnMetadata()
|
||||
/// * MessageDecoder::State::BODY: listener->OnBody()
|
||||
/// * MessageDecoder::State::EOS: listener->OnEOS()
|
||||
///
|
||||
/// \param[in] data a raw data to be processed. This data isn't
|
||||
/// copied. The passed memory must be kept alive through message
|
||||
/// processing.
|
||||
/// \param[in] size raw data size.
|
||||
/// \return Status
|
||||
Status Consume(const uint8_t* data, int64_t size);
|
||||
|
||||
/// \brief Feed data to the decoder as a Buffer.
|
||||
///
|
||||
/// If the decoder can decode one or more messages by the Buffer,
|
||||
/// the decoder calls listener->OnMessageDecoded() with a decoded
|
||||
/// message multiple times.
|
||||
///
|
||||
/// \param[in] buffer a Buffer to be processed.
|
||||
/// \return Status
|
||||
Status Consume(std::shared_ptr<Buffer> buffer);
|
||||
|
||||
/// \brief Return the number of bytes needed to advance the state of
|
||||
/// the decoder.
|
||||
///
|
||||
/// This method is provided for users who want to optimize performance.
|
||||
/// Normal users don't need to use this method.
|
||||
///
|
||||
/// Here is an example usage for normal users:
|
||||
///
|
||||
/// ~~~{.cpp}
|
||||
/// decoder.Consume(buffer1);
|
||||
/// decoder.Consume(buffer2);
|
||||
/// decoder.Consume(buffer3);
|
||||
/// ~~~
|
||||
///
|
||||
/// Decoder has internal buffer. If consumed data isn't enough to
|
||||
/// advance the state of the decoder, consumed data is buffered to
|
||||
/// the internal buffer. It causes performance overhead.
|
||||
///
|
||||
/// If you pass next_required_size() size data to each Consume()
|
||||
/// call, the decoder doesn't use its internal buffer. It improves
|
||||
/// performance.
|
||||
///
|
||||
/// Here is an example usage to avoid using internal buffer:
|
||||
///
|
||||
/// ~~~{.cpp}
|
||||
/// buffer1 = get_data(decoder.next_required_size());
|
||||
/// decoder.Consume(buffer1);
|
||||
/// buffer2 = get_data(decoder.next_required_size());
|
||||
/// decoder.Consume(buffer2);
|
||||
/// ~~~
|
||||
///
|
||||
/// Users can use this method to avoid creating small
|
||||
/// chunks. Message body must be contiguous data. If users pass
|
||||
/// small chunks to the decoder, the decoder needs concatenate small
|
||||
/// chunks internally. It causes performance overhead.
|
||||
///
|
||||
/// Here is an example usage to reduce small chunks:
|
||||
///
|
||||
/// ~~~{.cpp}
|
||||
/// buffer = AllocateResizableBuffer();
|
||||
/// while ((small_chunk = get_data(&small_chunk_size))) {
|
||||
/// auto current_buffer_size = buffer->size();
|
||||
/// buffer->Resize(current_buffer_size + small_chunk_size);
|
||||
/// memcpy(buffer->mutable_data() + current_buffer_size,
|
||||
/// small_chunk,
|
||||
/// small_chunk_size);
|
||||
/// if (buffer->size() < decoder.next_required_size()) {
|
||||
/// continue;
|
||||
/// }
|
||||
/// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
|
||||
/// decoder.Consume(chunk);
|
||||
/// buffer = AllocateResizableBuffer();
|
||||
/// }
|
||||
/// if (buffer->size() > 0) {
|
||||
/// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
|
||||
/// decoder.Consume(chunk);
|
||||
/// }
|
||||
/// ~~~
|
||||
///
|
||||
/// \return the number of bytes needed to advance the state of the
|
||||
/// decoder
|
||||
int64_t next_required_size() const;
|
||||
|
||||
/// \brief Return the current state of the decoder.
|
||||
///
|
||||
/// This method is provided for users who want to optimize performance.
|
||||
/// Normal users don't need to use this method.
|
||||
///
|
||||
/// Decoder doesn't need Buffer to process data on the
|
||||
/// MessageDecoder::State::INITIAL state and the
|
||||
/// MessageDecoder::State::METADATA_LENGTH. Creating Buffer has
|
||||
/// performance overhead. Advanced users can avoid creating Buffer
|
||||
/// by checking the current state of the decoder:
|
||||
///
|
||||
/// ~~~{.cpp}
|
||||
/// switch (decoder.state()) {
|
||||
/// case MessageDecoder::State::INITIAL:
|
||||
/// case MessageDecoder::State::METADATA_LENGTH:
|
||||
/// {
|
||||
/// uint8_t data[sizeof(int32_t)];
|
||||
/// auto data_size = input->Read(decoder.next_required_size(), data);
|
||||
/// decoder.Consume(data, data_size);
|
||||
/// }
|
||||
/// break;
|
||||
/// default:
|
||||
/// {
|
||||
/// auto buffer = input->Read(decoder.next_required_size());
|
||||
/// decoder.Consume(buffer);
|
||||
/// }
|
||||
/// break;
|
||||
/// }
|
||||
/// ~~~
|
||||
///
|
||||
/// \return the current state
|
||||
State state() const;
|
||||
|
||||
private:
|
||||
class MessageDecoderImpl;
|
||||
std::unique_ptr<MessageDecoderImpl> impl_;
|
||||
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(MessageDecoder);
|
||||
};
|
||||
|
||||
/// \brief Abstract interface for a sequence of messages
|
||||
/// \since 0.5.0
|
||||
class ARROW_EXPORT MessageReader {
|
||||
public:
|
||||
virtual ~MessageReader() = default;
|
||||
|
||||
/// \brief Create MessageReader that reads from InputStream
|
||||
static std::unique_ptr<MessageReader> Open(io::InputStream* stream);
|
||||
|
||||
/// \brief Create MessageReader that reads from owned InputStream
|
||||
static std::unique_ptr<MessageReader> Open(
|
||||
const std::shared_ptr<io::InputStream>& owned_stream);
|
||||
|
||||
/// \brief Read next Message from the interface
|
||||
///
|
||||
/// \return an arrow::ipc::Message instance
|
||||
virtual Result<std::unique_ptr<Message>> ReadNextMessage() = 0;
|
||||
};
|
||||
|
||||
// the first parameter of the function should be a pointer to metadata (aka.
|
||||
// org::apache::arrow::flatbuf::RecordBatch*)
|
||||
using FieldsLoaderFunction = std::function<Status(const void*, io::RandomAccessFile*)>;
|
||||
|
||||
/// \brief Read encapsulated RPC message from position in file
|
||||
///
|
||||
/// Read a length-prefixed message flatbuffer starting at the indicated file
|
||||
/// offset. If the message has a body with non-zero length, it will also be
|
||||
/// read
|
||||
///
|
||||
/// The metadata_length includes at least the length prefix and the flatbuffer
|
||||
///
|
||||
/// \param[in] offset the position in the file where the message starts. The
|
||||
/// first 4 bytes after the offset are the message length
|
||||
/// \param[in] metadata_length the total number of bytes to read from file
|
||||
/// \param[in] file the seekable file interface to read from
|
||||
/// \param[in] fields_loader the function for loading subset of fields from the given file
|
||||
/// \return the message read
|
||||
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<Message>> ReadMessage(
|
||||
const int64_t offset, const int32_t metadata_length, io::RandomAccessFile* file,
|
||||
const FieldsLoaderFunction& fields_loader = {});
|
||||
|
||||
/// \brief Read encapsulated RPC message from cached buffers
|
||||
///
|
||||
/// The buffers should contain an entire message. Partial reads are not handled.
|
||||
///
|
||||
/// This method can be used to read just the metadata by passing in a nullptr for the
|
||||
/// body. The body will then be skipped and the body size will not be validated.
|
||||
///
|
||||
/// If the body buffer is provided then it must be the complete body buffer
|
||||
///
|
||||
/// This is similar to Message::Open but performs slightly more validation (e.g. checks
|
||||
/// to see that the metadata length is correct and that the body is the size the metadata
|
||||
/// expected)
|
||||
///
|
||||
/// \param metadata The bytes for the metadata
|
||||
/// \param body The bytes for the body
|
||||
/// \return The message represented by the buffers
|
||||
ARROW_EXPORT Result<std::unique_ptr<Message>> ReadMessage(
|
||||
std::shared_ptr<Buffer> metadata, std::shared_ptr<Buffer> body);
|
||||
|
||||
/// \brief Asynchronous variant of reading an encapsulated message from a file
///
/// NOTE(review): the parameter descriptions below are inferred from the
/// parameter names and from the synchronous offset-based ReadMessage()
/// overload; confirm against the implementation.
///
/// \param[in] offset presumably the position in the file where the message starts
/// \param[in] metadata_length presumably the number of metadata bytes to read
/// \param[in] body_length presumably the length of the message body in bytes
/// \param[in] file the random-access file to read from
/// \param[in] context the IOContext to use (defaults to io::default_io_context())
/// \return a Future that resolves to the message read
ARROW_EXPORT
|
||||
Future<std::shared_ptr<Message>> ReadMessageAsync(
|
||||
const int64_t offset, const int32_t metadata_length, const int64_t body_length,
|
||||
io::RandomAccessFile* file, const io::IOContext& context = io::default_io_context());
|
||||
|
||||
/// \brief Advance stream to the next offset that is a multiple of
|
||||
/// `alignment` (8 by default) if its position is not aligned already
|
||||
/// \param[in] stream an input stream
|
||||
/// \param[in] alignment the byte multiple for the metadata prefix, usually 8
|
||||
/// or 64, to ensure the body starts on a multiple of that alignment
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status AlignStream(io::InputStream* stream, int32_t alignment = 8);
|
||||
|
||||
/// \brief Advance stream to the next offset that is a multiple of
|
||||
/// `alignment` (8 by default) if its position is not aligned already
|
||||
/// \param[in] stream an output stream
|
||||
/// \param[in] alignment the byte multiple for the metadata prefix, usually 8
|
||||
/// or 64, to ensure the body starts on a multiple of that alignment
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status AlignStream(io::OutputStream* stream, int32_t alignment = 8);
|
||||
|
||||
/// \brief Return error Status if file position is not a multiple of the
|
||||
/// indicated alignment
|
||||
ARROW_EXPORT
|
||||
Status CheckAligned(io::FileInterface* stream, int32_t alignment = 8);
|
||||
|
||||
/// \brief Read encapsulated IPC message (metadata and body) from InputStream
|
||||
///
|
||||
/// Returns null if there are not enough bytes available or the
|
||||
/// message length is 0 (e.g. EOS in a stream)
|
||||
///
|
||||
/// \param[in] stream an input stream
|
||||
/// \param[in] pool an optional MemoryPool to copy metadata on the CPU, if required
|
||||
/// \return Message
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<Message>> ReadMessage(io::InputStream* stream,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Feed data from InputStream to MessageDecoder to decode an
|
||||
/// encapsulated IPC message (metadata and body)
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
///
|
||||
/// \param[in] decoder a decoder
|
||||
/// \param[in] stream an input stream
|
||||
/// \return Status
|
||||
///
|
||||
/// \since 0.17.0
|
||||
ARROW_EXPORT
|
||||
Status DecodeMessage(MessageDecoder* decoder, io::InputStream* stream);
|
||||
|
||||
/// Write encapsulated IPC message. Does not make assumptions about
|
||||
/// whether the stream is aligned already. Can write legacy (pre
|
||||
/// version 0.15.0) IPC message if option set
|
||||
///
|
||||
/// continuation: 0xFFFFFFFF
|
||||
/// message_size: int32
|
||||
/// message: const void*
|
||||
/// padding
|
||||
///
|
||||
///
|
||||
/// \param[in] message a buffer containing the metadata to write
|
||||
/// \param[in] options IPC writing options, including alignment and
|
||||
/// legacy message support
|
||||
/// \param[in,out] file the OutputStream to write to
|
||||
/// \param[out] message_length the total size of the payload written including
|
||||
/// padding
|
||||
/// \return Status
|
||||
Status WriteMessage(const Buffer& message, const IpcWriteOptions& options,
|
||||
io::OutputStream* file, int32_t* message_length);
|
||||
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,162 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/io/caching.h"
|
||||
#include "arrow/ipc/type_fwd.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/compression.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class MemoryPool;
|
||||
|
||||
namespace ipc {
|
||||
|
||||
// ARROW-109: We set this number arbitrarily to help catch user mistakes. For
|
||||
// deeply nested schemas, it is expected the user will indicate explicitly the
|
||||
// maximum allowed recursion depth
|
||||
constexpr int kMaxNestingDepth = 64;
|
||||
|
||||
/// \brief Options for writing Arrow IPC messages
|
||||
struct ARROW_EXPORT IpcWriteOptions {
|
||||
/// \brief If true, allow field lengths that don't fit in a signed 32-bit int.
|
||||
///
|
||||
/// Some implementations may not be able to parse streams created with this option.
|
||||
bool allow_64bit = false;
|
||||
|
||||
/// \brief The maximum permitted schema nesting depth.
|
||||
int max_recursion_depth = kMaxNestingDepth;
|
||||
|
||||
/// \brief Write padding after memory buffers up to this multiple of bytes.
|
||||
int32_t alignment = 8;
|
||||
|
||||
/// \brief Write the pre-0.15.0 IPC message format
|
||||
///
|
||||
/// This legacy format consists of a 4-byte prefix instead of 8-byte.
|
||||
bool write_legacy_ipc_format = false;
|
||||
|
||||
/// \brief The memory pool to use for allocations made during IPC writing
|
||||
///
|
||||
/// While Arrow IPC is predominantly zero-copy, it may have to allocate
|
||||
/// memory in some cases (for example if compression is enabled).
|
||||
MemoryPool* memory_pool = default_memory_pool();
|
||||
|
||||
/// \brief Compression codec to use for record batch body buffers
|
||||
///
|
||||
/// May only be UNCOMPRESSED, LZ4_FRAME and ZSTD.
|
||||
std::shared_ptr<util::Codec> codec;
|
||||
|
||||
/// \brief Use global CPU thread pool to parallelize any computational tasks
|
||||
/// like compression
|
||||
bool use_threads = true;
|
||||
|
||||
/// \brief Whether to emit dictionary deltas
|
||||
///
|
||||
/// If false, a changed dictionary for a given field will emit a full
|
||||
/// dictionary replacement.
|
||||
/// If true, a changed dictionary will be compared against the previous
|
||||
/// version. If possible, a dictionary delta will be emitted, otherwise
|
||||
/// a full dictionary replacement.
|
||||
///
|
||||
/// Default is false to maximize stream compatibility.
|
||||
///
|
||||
/// Also, note that if a changed dictionary is a nested dictionary,
|
||||
/// then a delta is never emitted, for compatibility with the read path.
|
||||
bool emit_dictionary_deltas = false;
|
||||
|
||||
/// \brief Whether to unify dictionaries for the IPC file format
|
||||
///
|
||||
/// The IPC file format doesn't support dictionary replacements.
|
||||
/// Therefore, chunks of a column with a dictionary type must have the same
|
||||
/// dictionary in each record batch (or an extended dictionary + delta).
|
||||
///
|
||||
/// If this option is true, RecordBatchWriter::WriteTable will attempt
|
||||
/// to unify dictionaries across each table column. If this option is
|
||||
/// false, incompatible dictionaries across a table column will simply
|
||||
/// raise an error.
|
||||
///
|
||||
/// Note that enabling this option has a runtime cost. Also, not all types
|
||||
/// currently support dictionary unification.
|
||||
///
|
||||
/// This option is ignored for IPC streams, which support dictionary replacement
|
||||
/// and deltas.
|
||||
bool unify_dictionaries = false;
|
||||
|
||||
/// \brief Format version to use for IPC messages and their metadata.
|
||||
///
|
||||
/// Presently using V5 version (readable by 1.0.0 and later).
|
||||
/// V4 is also available (readable by 0.8.0 and later).
|
||||
MetadataVersion metadata_version = MetadataVersion::V5;
|
||||
|
||||
static IpcWriteOptions Defaults();
|
||||
};
|
||||
|
||||
/// \brief Options for reading Arrow IPC messages
|
||||
struct ARROW_EXPORT IpcReadOptions {
|
||||
/// \brief The maximum permitted schema nesting depth.
|
||||
int max_recursion_depth = kMaxNestingDepth;
|
||||
|
||||
/// \brief The memory pool to use for allocations made during IPC reading
|
||||
///
|
||||
/// While Arrow IPC is predominantly zero-copy, it may have to allocate
|
||||
/// memory in some cases (for example if compression is enabled).
|
||||
MemoryPool* memory_pool = default_memory_pool();
|
||||
|
||||
/// \brief Top-level schema fields to include when deserializing RecordBatch.
|
||||
///
|
||||
/// If empty (the default), return all deserialized fields.
|
||||
/// If non-empty, the values are the indices of fields in the top-level schema.
|
||||
std::vector<int> included_fields;
|
||||
|
||||
/// \brief Use global CPU thread pool to parallelize any computational tasks
|
||||
/// like decompression
|
||||
bool use_threads = true;
|
||||
|
||||
/// \brief Whether to convert incoming data to platform-native endianness
|
||||
///
|
||||
/// If the endianness of the received schema is not equal to platform-native
|
||||
/// endianness, then all buffers with endian-sensitive data will be byte-swapped.
|
||||
/// This includes the value buffers of numeric types, temporal types, decimal
|
||||
/// types, as well as the offset buffers of variable-sized binary and list-like
|
||||
/// types.
|
||||
///
|
||||
/// Endianness conversion is achieved by the RecordBatchFileReader,
|
||||
/// RecordBatchStreamReader and StreamDecoder classes.
|
||||
bool ensure_native_endian = true;
|
||||
|
||||
/// \brief Options to control caching behavior when pre-buffering is requested
|
||||
///
|
||||
/// The lazy property will always be reset to true to deliver the expected behavior
|
||||
io::CacheOptions pre_buffer_cache_options = io::CacheOptions::LazyDefaults();
|
||||
|
||||
static IpcReadOptions Defaults();
|
||||
};
|
||||
|
||||
namespace internal {
|
||||
|
||||
Status CheckCompressionSupported(Compression::type codec);
|
||||
|
||||
} // namespace internal
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,555 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Read Arrow files and streams
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/io/caching.h"
|
||||
#include "arrow/io/type_fwd.h"
|
||||
#include "arrow/ipc/message.h"
|
||||
#include "arrow/ipc/options.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/type_fwd.h"
|
||||
#include "arrow/util/async_generator.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace ipc {
|
||||
|
||||
class DictionaryMemo;
|
||||
struct IpcPayload;
|
||||
|
||||
using RecordBatchReader = ::arrow::RecordBatchReader;
|
||||
|
||||
/// \brief Counters reported by the IPC readers' stats() accessors.
///
/// Every counter is initialized to zero.
struct ReadStats {
|
||||
/// Number of IPC messages read.
|
||||
int64_t num_messages = 0;
|
||||
/// Number of record batches read.
|
||||
int64_t num_record_batches = 0;
|
||||
/// Number of dictionary batches read.
|
||||
///
|
||||
/// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
|
||||
int64_t num_dictionary_batches = 0;
|
||||
|
||||
/// Number of dictionary deltas read.
|
||||
int64_t num_dictionary_deltas = 0;
|
||||
/// Number of replaced dictionaries (i.e. where a dictionary batch replaces
|
||||
/// an existing dictionary with an unrelated new dictionary).
|
||||
int64_t num_replaced_dictionaries = 0;
|
||||
};
|
||||
|
||||
/// \brief Synchronous batch stream reader that reads from io::InputStream
|
||||
///
|
||||
/// This class reads the schema (plus any dictionaries) as the first messages
|
||||
/// in the stream, followed by record batches. For more granular zero-copy
|
||||
/// reads see the ReadRecordBatch functions
|
||||
class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader {
|
||||
public:
|
||||
/// Create batch reader from generic MessageReader.
|
||||
/// This will take ownership of the given MessageReader.
|
||||
///
|
||||
/// \param[in] message_reader a MessageReader implementation
|
||||
/// \param[in] options any IPC reading options (optional)
|
||||
/// \return the created batch reader
|
||||
static Result<std::shared_ptr<RecordBatchStreamReader>> Open(
|
||||
std::unique_ptr<MessageReader> message_reader,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Record batch stream reader from InputStream
|
||||
///
|
||||
/// \param[in] stream an input stream instance. Must stay alive throughout
|
||||
/// lifetime of stream reader
|
||||
/// \param[in] options any IPC reading options (optional)
|
||||
/// \return the created batch reader
|
||||
static Result<std::shared_ptr<RecordBatchStreamReader>> Open(
|
||||
io::InputStream* stream,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Open stream and retain ownership of stream object
|
||||
/// \param[in] stream the input stream
|
||||
/// \param[in] options any IPC reading options (optional)
|
||||
/// \return the created batch reader
|
||||
static Result<std::shared_ptr<RecordBatchStreamReader>> Open(
|
||||
const std::shared_ptr<io::InputStream>& stream,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Return current read statistics
|
||||
virtual ReadStats stats() const = 0;
|
||||
};
|
||||
|
||||
/// \brief Reads the record batch file format
|
||||
class ARROW_EXPORT RecordBatchFileReader
|
||||
: public std::enable_shared_from_this<RecordBatchFileReader> {
|
||||
public:
|
||||
virtual ~RecordBatchFileReader() = default;
|
||||
|
||||
/// \brief Open a RecordBatchFileReader
|
||||
///
|
||||
/// Open a file-like object that is assumed to be self-contained; i.e., the
|
||||
/// end of the file interface is the end of the Arrow file. Note that there
|
||||
/// can be any amount of data preceding the Arrow-formatted data, because we
|
||||
/// need only locate the end of the Arrow file stream to discover the metadata
|
||||
/// and then proceed to read the data into memory.
|
||||
static Result<std::shared_ptr<RecordBatchFileReader>> Open(
|
||||
io::RandomAccessFile* file,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Open a RecordBatchFileReader
|
||||
/// If the file is embedded within some larger file or memory region, you can
|
||||
/// pass the absolute memory offset to the end of the file (which contains the
|
||||
/// metadata footer). The metadata must have been written with memory offsets
|
||||
/// relative to the start of the containing file
|
||||
///
|
||||
/// \param[in] file the data source
|
||||
/// \param[in] footer_offset the position of the end of the Arrow file
|
||||
/// \param[in] options options for IPC reading
|
||||
/// \return the returned reader
|
||||
static Result<std::shared_ptr<RecordBatchFileReader>> Open(
|
||||
io::RandomAccessFile* file, int64_t footer_offset,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Version of Open that retains ownership of file
|
||||
///
|
||||
/// \param[in] file the data source
|
||||
/// \param[in] options options for IPC reading
|
||||
/// \return the returned reader
|
||||
static Result<std::shared_ptr<RecordBatchFileReader>> Open(
|
||||
const std::shared_ptr<io::RandomAccessFile>& file,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Version of Open that retains ownership of file
|
||||
///
|
||||
/// \param[in] file the data source
|
||||
/// \param[in] footer_offset the position of the end of the Arrow file
|
||||
/// \param[in] options options for IPC reading
|
||||
/// \return the returned reader
|
||||
static Result<std::shared_ptr<RecordBatchFileReader>> Open(
|
||||
const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Open a file asynchronously (owns the file).
|
||||
static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
|
||||
const std::shared_ptr<io::RandomAccessFile>& file,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Open a file asynchronously (borrows the file).
|
||||
static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
|
||||
io::RandomAccessFile* file,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Open a file asynchronously (owns the file).
|
||||
static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
|
||||
const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief Open a file asynchronously (borrows the file).
|
||||
static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
|
||||
io::RandomAccessFile* file, int64_t footer_offset,
|
||||
const IpcReadOptions& options = IpcReadOptions::Defaults());
|
||||
|
||||
/// \brief The schema read from the file
|
||||
virtual std::shared_ptr<Schema> schema() const = 0;
|
||||
|
||||
/// \brief Returns the number of record batches in the file
|
||||
virtual int num_record_batches() const = 0;
|
||||
|
||||
/// \brief Return the metadata version from the file metadata
|
||||
virtual MetadataVersion version() const = 0;
|
||||
|
||||
/// \brief Return the contents of the custom_metadata field from the file's
|
||||
/// Footer
|
||||
virtual std::shared_ptr<const KeyValueMetadata> metadata() const = 0;
|
||||
|
||||
/// \brief Read a particular record batch from the file. Does not copy memory
|
||||
/// if the input source supports zero-copy.
|
||||
///
|
||||
/// \param[in] i the index of the record batch to return
|
||||
/// \return the read batch
|
||||
virtual Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(int i) = 0;
|
||||
|
||||
/// \brief Read a particular record batch along with its custom metadata from the file.
|
||||
/// Does not copy memory if the input source supports zero-copy.
|
||||
///
|
||||
/// \param[in] i the index of the record batch to return
|
||||
/// \return a struct containing the read batch and its custom metadata
|
||||
virtual Result<RecordBatchWithMetadata> ReadRecordBatchWithCustomMetadata(int i) = 0;
|
||||
|
||||
/// \brief Return current read statistics
|
||||
virtual ReadStats stats() const = 0;
|
||||
|
||||
/// \brief Computes the total number of rows in the file.
|
||||
virtual Result<int64_t> CountRows() = 0;
|
||||
|
||||
/// \brief Begin loading metadata for the desired batches into memory.
|
||||
///
|
||||
/// This method will also begin loading all dictionary messages into memory.
|
||||
///
|
||||
/// For a regular file this will immediately begin disk I/O in the background on a
|
||||
/// thread on the IOContext's thread pool. If the file is memory mapped this will
|
||||
/// ensure the memory needed for the metadata is paged from disk into memory
|
||||
///
|
||||
/// \param indices Indices of the batches to prefetch
|
||||
/// If empty then all batches will be prefetched.
|
||||
virtual Status PreBufferMetadata(const std::vector<int>& indices) = 0;
|
||||
|
||||
/// \brief Get a reentrant generator of record batches.
|
||||
///
|
||||
/// \param[in] coalesce If true, enable I/O coalescing.
|
||||
/// \param[in] io_context The IOContext to use (controls which thread pool
|
||||
/// is used for I/O).
|
||||
/// \param[in] cache_options Options for coalescing (if enabled).
|
||||
/// \param[in] executor Optionally, an executor to use for decoding record
|
||||
/// batches. This is generally only a benefit for very wide and/or
|
||||
/// compressed batches.
|
||||
virtual Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
|
||||
const bool coalesce = false,
|
||||
const io::IOContext& io_context = io::default_io_context(),
|
||||
const io::CacheOptions cache_options = io::CacheOptions::LazyDefaults(),
|
||||
arrow::internal::Executor* executor = NULLPTR) = 0;
|
||||
};
|
||||
|
||||
/// \brief A general listener class to receive events.
|
||||
///
|
||||
/// You must implement callback methods for interested events.
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
///
|
||||
/// \since 0.17.0
|
||||
class ARROW_EXPORT Listener {
|
||||
public:
|
||||
virtual ~Listener() = default;
|
||||
|
||||
/// \brief Called when end-of-stream is received.
|
||||
///
|
||||
/// The default implementation just returns arrow::Status::OK().
|
||||
///
|
||||
/// \return Status
|
||||
///
|
||||
/// \see StreamDecoder
|
||||
virtual Status OnEOS();
|
||||
|
||||
/// \brief Called when a record batch is decoded.
|
||||
///
|
||||
/// The default implementation just returns
|
||||
/// arrow::Status::NotImplemented().
|
||||
///
|
||||
/// \param[in] record_batch a record batch decoded
|
||||
/// \return Status
|
||||
///
|
||||
/// \see StreamDecoder
|
||||
virtual Status OnRecordBatchDecoded(std::shared_ptr<RecordBatch> record_batch);
|
||||
|
||||
/// \brief Called when a schema is decoded.
|
||||
///
|
||||
/// The default implementation just returns arrow::Status::OK().
|
||||
///
|
||||
/// \param[in] schema a schema decoded
|
||||
/// \return Status
|
||||
///
|
||||
/// \see StreamDecoder
|
||||
virtual Status OnSchemaDecoded(std::shared_ptr<Schema> schema);
|
||||
};
|
||||
|
||||
/// \brief Collect schema and record batches decoded by StreamDecoder.
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
///
|
||||
/// \since 0.17.0
|
||||
class ARROW_EXPORT CollectListener : public Listener {
|
||||
public:
|
||||
CollectListener() : schema_(), record_batches_() {}
|
||||
virtual ~CollectListener() = default;
|
||||
|
||||
Status OnSchemaDecoded(std::shared_ptr<Schema> schema) override {
|
||||
schema_ = std::move(schema);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status OnRecordBatchDecoded(std::shared_ptr<RecordBatch> record_batch) override {
|
||||
record_batches_.push_back(std::move(record_batch));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \return the decoded schema
|
||||
std::shared_ptr<Schema> schema() const { return schema_; }
|
||||
|
||||
/// \return all the decoded record batches
|
||||
std::vector<std::shared_ptr<RecordBatch>> record_batches() const {
|
||||
return record_batches_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<Schema> schema_;
|
||||
std::vector<std::shared_ptr<RecordBatch>> record_batches_;
|
||||
};
|
||||
|
||||
/// \brief Push style stream decoder that receives data from user.
|
||||
///
|
||||
/// This class decodes the Apache Arrow IPC streaming format data.
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
///
|
||||
/// \see https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
|
||||
///
|
||||
/// \since 0.17.0
|
||||
class ARROW_EXPORT StreamDecoder {
|
||||
public:
|
||||
/// \brief Construct a stream decoder.
|
||||
///
|
||||
/// \param[in] listener a Listener that must implement
|
||||
/// Listener::OnRecordBatchDecoded() to receive decoded record batches
|
||||
/// \param[in] options any IPC reading options (optional)
|
||||
StreamDecoder(std::shared_ptr<Listener> listener,
|
||||
IpcReadOptions options = IpcReadOptions::Defaults());
|
||||
|
||||
virtual ~StreamDecoder();
|
||||
|
||||
/// \brief Feed data to the decoder as raw data.
|
||||
///
|
||||
/// If the decoder can read one or more record batches by the data,
|
||||
/// the decoder calls listener->OnRecordBatchDecoded() with a
|
||||
/// decoded record batch multiple times.
|
||||
///
|
||||
/// \param[in] data a raw data to be processed. This data isn't
|
||||
/// copied. The passed memory must be kept alive through record
|
||||
/// batch processing.
|
||||
/// \param[in] size raw data size.
|
||||
/// \return Status
|
||||
Status Consume(const uint8_t* data, int64_t size);
|
||||
|
||||
/// \brief Feed data to the decoder as a Buffer.
|
||||
///
|
||||
/// If the decoder can read one or more record batches by the
|
||||
/// Buffer, the decoder calls listener->OnRecordBatchDecoded() with a
|
||||
/// decoded record batch multiple times.
|
||||
///
|
||||
/// \param[in] buffer a Buffer to be processed.
|
||||
/// \return Status
|
||||
Status Consume(std::shared_ptr<Buffer> buffer);
|
||||
|
||||
/// \return the shared schema of the record batches in the stream
|
||||
std::shared_ptr<Schema> schema() const;
|
||||
|
||||
/// \brief Return the number of bytes needed to advance the state of
|
||||
/// the decoder.
|
||||
///
|
||||
/// This method is provided for users who want to optimize performance.
|
||||
/// Normal users don't need to use this method.
|
||||
///
|
||||
/// Here is an example usage for normal users:
|
||||
///
|
||||
/// ~~~{.cpp}
|
||||
/// decoder.Consume(buffer1);
|
||||
/// decoder.Consume(buffer2);
|
||||
/// decoder.Consume(buffer3);
|
||||
/// ~~~
|
||||
///
|
||||
/// Decoder has internal buffer. If consumed data isn't enough to
|
||||
/// advance the state of the decoder, consumed data is buffered to
|
||||
/// the internal buffer. It causes performance overhead.
|
||||
///
|
||||
/// If you pass next_required_size() size data to each Consume()
|
||||
/// call, the decoder doesn't use its internal buffer. It improves
|
||||
/// performance.
|
||||
///
|
||||
/// Here is an example usage to avoid using internal buffer:
|
||||
///
|
||||
/// ~~~{.cpp}
|
||||
/// buffer1 = get_data(decoder.next_required_size());
|
||||
/// decoder.Consume(buffer1);
|
||||
/// buffer2 = get_data(decoder.next_required_size());
|
||||
/// decoder.Consume(buffer2);
|
||||
/// ~~~
|
||||
///
|
||||
/// Users can use this method to avoid creating small chunks. Record
|
||||
/// batch data must be contiguous data. If users pass small chunks
|
||||
/// to the decoder, the decoder needs to concatenate small chunks
|
||||
/// internally. It causes performance overhead.
|
||||
///
|
||||
/// Here is an example usage to reduce small chunks:
|
||||
///
|
||||
/// ~~~{.cpp}
|
||||
/// buffer = AllocateResizableBuffer();
|
||||
/// while ((small_chunk = get_data(&small_chunk_size))) {
|
||||
/// auto current_buffer_size = buffer->size();
|
||||
/// buffer->Resize(current_buffer_size + small_chunk_size);
|
||||
/// memcpy(buffer->mutable_data() + current_buffer_size,
|
||||
/// small_chunk,
|
||||
/// small_chunk_size);
|
||||
/// if (buffer->size() < decoder.next_required_size()) {
|
||||
/// continue;
|
||||
/// }
|
||||
/// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
|
||||
/// decoder.Consume(chunk);
|
||||
/// buffer = AllocateResizableBuffer();
|
||||
/// }
|
||||
/// if (buffer->size() > 0) {
|
||||
/// std::shared_ptr<arrow::Buffer> chunk(buffer.release());
|
||||
/// decoder.Consume(chunk);
|
||||
/// }
|
||||
/// ~~~
|
||||
///
|
||||
/// \return the number of bytes needed to advance the state of the
|
||||
/// decoder
|
||||
int64_t next_required_size() const;
|
||||
|
||||
/// \brief Return current read statistics
|
||||
ReadStats stats() const;
|
||||
|
||||
private:
|
||||
class StreamDecoderImpl;
|
||||
std::unique_ptr<StreamDecoderImpl> impl_;
|
||||
|
||||
ARROW_DISALLOW_COPY_AND_ASSIGN(StreamDecoder);
|
||||
};
|
||||
|
||||
// Generic read functions; they do not copy data if the input supports zero-copy reads
|
||||
|
||||
/// \brief Read Schema from stream serialized as a single IPC message
|
||||
/// and populate any dictionary-encoded fields into a DictionaryMemo
|
||||
///
|
||||
/// \param[in] stream an InputStream
|
||||
/// \param[in] dictionary_memo for recording dictionary-encoded fields
|
||||
/// \return the output Schema
|
||||
///
|
||||
/// If record batches follow the schema, it is better to use
|
||||
/// RecordBatchStreamReader
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Schema>> ReadSchema(io::InputStream* stream,
|
||||
DictionaryMemo* dictionary_memo);
|
||||
|
||||
/// \brief Read Schema from encapsulated Message
|
||||
///
|
||||
/// \param[in] message the message containing the Schema IPC metadata
|
||||
/// \param[in] dictionary_memo DictionaryMemo for recording dictionary-encoded
|
||||
/// fields. Can be nullptr if you are sure there are no
|
||||
/// dictionary-encoded fields
|
||||
/// \return the resulting Schema
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Schema>> ReadSchema(const Message& message,
|
||||
DictionaryMemo* dictionary_memo);
|
||||
|
||||
/// Read record batch as encapsulated IPC message with metadata size prefix and
|
||||
/// header
|
||||
///
|
||||
/// \param[in] schema the record batch schema
|
||||
/// \param[in] dictionary_memo DictionaryMemo which has any
|
||||
/// dictionaries. Can be nullptr if you are sure there are no
|
||||
/// dictionary-encoded fields
|
||||
/// \param[in] options IPC options for reading
|
||||
/// \param[in] stream the input stream where the batch is located
|
||||
/// \return the read record batch
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
|
||||
const std::shared_ptr<Schema>& schema, const DictionaryMemo* dictionary_memo,
|
||||
const IpcReadOptions& options, io::InputStream* stream);
|
||||
|
||||
/// \brief Read record batch from message
|
||||
///
|
||||
/// \param[in] message a Message containing the record batch metadata
|
||||
/// \param[in] schema the record batch schema
|
||||
/// \param[in] dictionary_memo DictionaryMemo which has any
|
||||
/// dictionaries. Can be nullptr if you are sure there are no
|
||||
/// dictionary-encoded fields
|
||||
/// \param[in] options IPC options for reading
|
||||
/// \return the read record batch
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
|
||||
const Message& message, const std::shared_ptr<Schema>& schema,
|
||||
const DictionaryMemo* dictionary_memo, const IpcReadOptions& options);
|
||||
|
||||
/// Read record batch from file given metadata and schema
|
||||
///
|
||||
/// \param[in] metadata a Buffer containing the record batch metadata
|
||||
/// \param[in] schema the record batch schema
|
||||
/// \param[in] dictionary_memo DictionaryMemo which has any
|
||||
/// dictionaries. Can be nullptr if you are sure there are no
|
||||
/// dictionary-encoded fields
|
||||
/// \param[in] file a random access file
|
||||
/// \param[in] options options for deserialization
|
||||
/// \return the read record batch
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
|
||||
const Buffer& metadata, const std::shared_ptr<Schema>& schema,
|
||||
const DictionaryMemo* dictionary_memo, const IpcReadOptions& options,
|
||||
io::RandomAccessFile* file);
|
||||
|
||||
/// \brief Read arrow::Tensor as encapsulated IPC message in file
|
||||
///
|
||||
/// \param[in] file an InputStream pointed at the start of the message
|
||||
/// \return the read tensor
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Tensor>> ReadTensor(io::InputStream* file);
|
||||
|
||||
/// \brief EXPERIMENTAL: Read arrow::Tensor from IPC message
|
||||
///
|
||||
/// \param[in] message a Message containing the tensor metadata and body
|
||||
/// \return the read tensor
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Tensor>> ReadTensor(const Message& message);
|
||||
|
||||
/// \brief EXPERIMENTAL: Read arrow::SparseTensor as encapsulated IPC message in file
|
||||
///
|
||||
/// \param[in] file an InputStream pointed at the start of the message
|
||||
/// \return the read sparse tensor
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<SparseTensor>> ReadSparseTensor(io::InputStream* file);
|
||||
|
||||
/// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message
|
||||
///
|
||||
/// \param[in] message a Message containing the tensor metadata and body
|
||||
/// \return the read sparse tensor
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<SparseTensor>> ReadSparseTensor(const Message& message);
|
||||
|
||||
namespace internal {
|
||||
|
||||
// These internal APIs may change without warning or deprecation
|
||||
|
||||
/// \brief EXPERIMENTAL: Read the number of sparse tensor body buffers from metadata
|
||||
/// \param[in] metadata a Buffer containing the sparse tensor metadata
|
||||
/// \return the count of the body buffers
|
||||
ARROW_EXPORT
|
||||
Result<size_t> ReadSparseTensorBodyBufferCount(const Buffer& metadata);
|
||||
|
||||
/// \brief EXPERIMENTAL: Read arrow::SparseTensor from an IpcPayload
|
||||
/// \param[in] payload an IpcPayload containing a serialized SparseTensor
|
||||
/// \return the read sparse tensor
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<SparseTensor>> ReadSparseTensorPayload(const IpcPayload& payload);
|
||||
|
||||
// For fuzzing targets
|
||||
ARROW_EXPORT
|
||||
Status FuzzIpcStream(const uint8_t* data, int64_t size);
|
||||
ARROW_EXPORT
|
||||
Status FuzzIpcTensorStream(const uint8_t* data, int64_t size);
|
||||
ARROW_EXPORT
|
||||
Status FuzzIpcFile(const uint8_t* data, int64_t size);
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,180 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/array.h"
|
||||
#include "arrow/record_batch.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/testing/visibility.h"
|
||||
#include "arrow/type.h"
|
||||
|
||||
namespace arrow {
|
||||
namespace ipc {
|
||||
namespace test {
|
||||
|
||||
// A typedef used for test parameterization
|
||||
typedef Status MakeRecordBatch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
void CompareArraysDetailed(int index, const Array& result, const Array& expected);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
void CompareBatchColumnsDetailed(const RecordBatch& result, const RecordBatch& expected);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool,
|
||||
std::shared_ptr<Array>* out, uint32_t seed = 0,
|
||||
int32_t min = 0, int32_t max = 1000);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeRandomInt64Array(int64_t length, bool include_nulls, MemoryPool* pool,
|
||||
std::shared_ptr<Array>* out, uint32_t seed = 0);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists,
|
||||
bool include_nulls, MemoryPool* pool,
|
||||
std::shared_ptr<Array>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeRandomLargeListArray(const std::shared_ptr<Array>& child_array, int num_lists,
|
||||
bool include_nulls, MemoryPool* pool,
|
||||
std::shared_ptr<Array>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeRandomBooleanArray(const int length, bool include_nulls,
|
||||
std::shared_ptr<Array>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeBooleanBatchSized(const int length, std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeBooleanBatch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeIntBatchSized(int length, std::shared_ptr<RecordBatch>* out,
|
||||
uint32_t seed = 0);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeIntRecordBatch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeFloat3264BatchSized(int length, std::shared_ptr<RecordBatch>* out,
|
||||
uint32_t seed = 0);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeFloat3264Batch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeFloatBatchSized(int length, std::shared_ptr<RecordBatch>* out,
|
||||
uint32_t seed = 0);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeFloatBatch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* pool,
|
||||
std::shared_ptr<Array>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out,
|
||||
bool with_nulls = true);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeNullRecordBatch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeFixedSizeListRecordBatch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeZeroLengthRecordBatch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeNonNullRecordBatch(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeDeeplyNestedList(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeStruct(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeUnion(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeDictionary(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeDictionaryFlat(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeNestedDictionary(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeMap(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeMapOfDictionary(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeDates(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeTimestamps(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeIntervals(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeTimes(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeFWBinary(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeDecimal(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeNull(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeUuid(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeComplex128(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeDictExtension(std::shared_ptr<RecordBatch>* out);
|
||||
|
||||
ARROW_TESTING_EXPORT
|
||||
Status MakeRandomTensor(const std::shared_ptr<DataType>& type,
|
||||
const std::vector<int64_t>& shape, bool row_major_p,
|
||||
std::shared_ptr<Tensor>* out, uint32_t seed = 0);
|
||||
|
||||
} // namespace test
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,65 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace arrow {
|
||||
namespace ipc {
|
||||
|
||||
/// \brief Metadata versions, listed from oldest to newest
///
/// Enumerators are numbered consecutively starting at zero.
enum class MetadataVersion : char {
  /// Release 0.1.0
  V1 = 0,

  /// Release 0.2.0
  V2 = 1,

  /// Releases 0.3.0 through 0.7.1
  V3 = 2,

  /// Releases 0.8.0 through 0.17.0
  V4 = 3,

  /// Release 1.0.0 and later
  V5 = 4
};
|
||||
|
||||
class Message;
|
||||
/// \brief Kinds of IPC message
///
/// Enumerators are numbered consecutively starting at zero, so NONE is the
/// value-initialized state.
enum class MessageType {
  NONE = 0,
  SCHEMA = 1,
  DICTIONARY_BATCH = 2,
  RECORD_BATCH = 3,
  TENSOR = 4,
  SPARSE_TENSOR = 5
};
|
||||
|
||||
struct IpcReadOptions;
|
||||
struct IpcWriteOptions;
|
||||
|
||||
class MessageReader;
|
||||
|
||||
class RecordBatchStreamReader;
|
||||
class RecordBatchFileReader;
|
||||
class RecordBatchWriter;
|
||||
|
||||
namespace feather {
|
||||
|
||||
class Reader;
|
||||
|
||||
} // namespace feather
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,41 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace arrow {
|
||||
namespace ipc {
|
||||
|
||||
// Buffers are padded to 64-byte boundaries (for SIMD)
static constexpr int32_t kArrowAlignment = 64;

// Tensors are padded to 64-byte boundaries
static constexpr int32_t kTensorAlignment = 64;

// Align on 8-byte boundaries in IPC
static constexpr int32_t kArrowIpcAlignment = 8;

// kArrowAlignment zero bytes (presumably the source of padding when writing;
// the users of this array are not visible from this header)
static constexpr uint8_t kPaddingBytes[kArrowAlignment] = {0};

/// \brief Round a byte count up to the next multiple of an alignment
///
/// Declared constexpr so that padded sizes can also be computed in constant
/// expressions (array bounds, static_assert); runtime callers are unaffected.
///
/// \param[in] nbytes the unpadded length in bytes; expected to be non-negative
/// (integer division truncates toward zero, so negative inputs are not
/// rounded to a lower multiple)
/// \param[in] alignment the boundary to pad to; must be strictly positive
/// since it is used as a divisor. Defaults to kArrowAlignment
/// \return the smallest multiple of alignment that is >= nbytes
static constexpr int64_t PaddedLength(int64_t nbytes,
                                      int32_t alignment = kArrowAlignment) {
  return ((nbytes + alignment - 1) / alignment) * alignment;
}
|
||||
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
@@ -0,0 +1,474 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
// Implement Arrow streaming binary format
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "arrow/ipc/dictionary.h" // IWYU pragma: export
|
||||
#include "arrow/ipc/message.h"
|
||||
#include "arrow/ipc/options.h"
|
||||
#include "arrow/result.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
class Array;
|
||||
class Buffer;
|
||||
class MemoryManager;
|
||||
class MemoryPool;
|
||||
class RecordBatch;
|
||||
class Schema;
|
||||
class Status;
|
||||
class Table;
|
||||
class Tensor;
|
||||
class SparseTensor;
|
||||
|
||||
namespace io {
|
||||
|
||||
class OutputStream;
|
||||
|
||||
} // namespace io
|
||||
|
||||
namespace ipc {
|
||||
|
||||
/// \brief Intermediate data structure with metadata header, and zero
|
||||
/// or more buffers for the message body.
|
||||
struct IpcPayload {
|
||||
MessageType type = MessageType::NONE;
|
||||
std::shared_ptr<Buffer> metadata;
|
||||
std::vector<std::shared_ptr<Buffer>> body_buffers;
|
||||
int64_t body_length = 0; // serialized body length (padded, maybe compressed)
|
||||
int64_t raw_body_length = 0; // initial uncompressed body length
|
||||
};
|
||||
|
||||
/// \brief Write statistics; every counter starts at zero
struct WriteStats {
  /// How many IPC messages were written.
  int64_t num_messages{0};

  /// How many record batches were written.
  int64_t num_record_batches{0};

  /// How many dictionary batches were written.
  ///
  /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
  int64_t num_dictionary_batches{0};

  /// How many dictionary deltas were written.
  int64_t num_dictionary_deltas{0};

  /// How many dictionaries were replaced, i.e. how often a dictionary batch
  /// substituted an existing dictionary with an unrelated new one.
  int64_t num_replaced_dictionaries{0};

  /// Total size in bytes of the record batches emitted, counted two ways:
  /// the "raw" size sums the original buffer sizes, while the "serialized"
  /// size includes padding and (optionally) compression.
  int64_t total_raw_body_size{0};
  int64_t total_serialized_body_size{0};
};
|
||||
|
||||
/// \class RecordBatchWriter
|
||||
/// \brief Abstract interface for writing a stream of record batches
|
||||
class ARROW_EXPORT RecordBatchWriter {
|
||||
public:
|
||||
virtual ~RecordBatchWriter();
|
||||
|
||||
/// \brief Write a record batch to the stream
|
||||
///
|
||||
/// \param[in] batch the record batch to write to the stream
|
||||
/// \return Status
|
||||
virtual Status WriteRecordBatch(const RecordBatch& batch) = 0;
|
||||
|
||||
/// \brief Write a record batch with custom metadata to the stream
|
||||
///
|
||||
/// \param[in] batch the record batch to write to the stream
|
||||
/// \param[in] custom_metadata the record batch's custom metadata to write to the stream
|
||||
/// \return Status
|
||||
virtual Status WriteRecordBatch(
|
||||
const RecordBatch& batch,
|
||||
const std::shared_ptr<const KeyValueMetadata>& custom_metadata);
|
||||
|
||||
/// \brief Write possibly-chunked table by creating sequence of record batches
|
||||
/// \param[in] table table to write
|
||||
/// \return Status
|
||||
Status WriteTable(const Table& table);
|
||||
|
||||
/// \brief Write Table with a particular chunksize
|
||||
/// \param[in] table table to write
|
||||
/// \param[in] max_chunksize maximum length of table chunks. To indicate
|
||||
/// that no maximum should be enforced, pass -1.
|
||||
/// \return Status
|
||||
virtual Status WriteTable(const Table& table, int64_t max_chunksize);
|
||||
|
||||
/// \brief Perform any logic necessary to finish the stream
|
||||
///
|
||||
/// \return Status
|
||||
virtual Status Close() = 0;
|
||||
|
||||
/// \brief Return current write statistics
|
||||
virtual WriteStats stats() const = 0;
|
||||
};
|
||||
|
||||
/// \defgroup record-batch-writer-factories Functions for creating RecordBatchWriter
|
||||
/// instances
|
||||
///
|
||||
/// @{
|
||||
|
||||
/// Create a new IPC stream writer from stream sink and schema. User is
|
||||
/// responsible for closing the actual OutputStream.
|
||||
///
|
||||
/// \param[in] sink output stream to write to
|
||||
/// \param[in] schema the schema of the record batches to be written
|
||||
/// \param[in] options options for serialization
|
||||
/// \return Result<std::shared_ptr<RecordBatchWriter>>
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
|
||||
io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
|
||||
const IpcWriteOptions& options = IpcWriteOptions::Defaults());
|
||||
|
||||
/// Create a new IPC stream writer from stream sink and schema. User is
|
||||
/// responsible for closing the actual OutputStream.
|
||||
///
|
||||
/// \param[in] sink output stream to write to
|
||||
/// \param[in] schema the schema of the record batches to be written
|
||||
/// \param[in] options options for serialization
|
||||
/// \return Result<std::shared_ptr<RecordBatchWriter>>
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
|
||||
std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
|
||||
const IpcWriteOptions& options = IpcWriteOptions::Defaults());
|
||||
|
||||
/// Create a new IPC file writer from stream sink and schema
|
||||
///
|
||||
/// \param[in] sink output stream to write to
|
||||
/// \param[in] schema the schema of the record batches to be written
|
||||
/// \param[in] options options for serialization, optional
|
||||
/// \param[in] metadata custom metadata for File Footer, optional
|
||||
/// \return Result<std::shared_ptr<RecordBatchWriter>>
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
|
||||
io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
|
||||
const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
|
||||
|
||||
/// Create a new IPC file writer from stream sink and schema
|
||||
///
|
||||
/// \param[in] sink output stream to write to
|
||||
/// \param[in] schema the schema of the record batches to be written
|
||||
/// \param[in] options options for serialization, optional
|
||||
/// \param[in] metadata custom metadata for File Footer, optional
|
||||
/// \return Result<std::shared_ptr<RecordBatchWriter>>
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
|
||||
std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
|
||||
const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
|
||||
|
||||
/// @}
|
||||
|
||||
/// \brief Low-level API for writing a record batch (without schema)
|
||||
/// to an OutputStream as encapsulated IPC message. See Arrow format
|
||||
/// documentation for more detail.
|
||||
///
|
||||
/// \param[in] batch the record batch to write
|
||||
/// \param[in] buffer_start_offset the start offset to use in the buffer metadata,
|
||||
/// generally should be 0
|
||||
/// \param[in] dst an OutputStream
|
||||
/// \param[out] metadata_length the size of the length-prefixed flatbuffer
|
||||
/// including padding to a 64-byte boundary
|
||||
/// \param[out] body_length the size of the contiguous buffer block plus padding bytes
|
||||
/// \param[in] options options for serialization
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset,
|
||||
io::OutputStream* dst, int32_t* metadata_length,
|
||||
int64_t* body_length, const IpcWriteOptions& options);
|
||||
|
||||
/// \brief Serialize record batch as encapsulated IPC message in a new buffer
|
||||
///
|
||||
/// \param[in] batch the record batch
|
||||
/// \param[in] options the IpcWriteOptions to use for serialization
|
||||
/// \return the serialized message
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
|
||||
const IpcWriteOptions& options);
|
||||
|
||||
/// \brief Serialize record batch as encapsulated IPC message in a new buffer
|
||||
///
|
||||
/// \param[in] batch the record batch
|
||||
/// \param[in] mm a MemoryManager to allocate memory from
|
||||
/// \return the serialized message
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
|
||||
std::shared_ptr<MemoryManager> mm);
|
||||
|
||||
/// \brief Write record batch to OutputStream
|
||||
///
|
||||
/// \param[in] batch the record batch to write
|
||||
/// \param[in] options the IpcWriteOptions to use for serialization
|
||||
/// \param[in] out the OutputStream to write the output to
|
||||
/// \return Status
|
||||
///
|
||||
/// If writing to pre-allocated memory, you can use
|
||||
/// arrow::ipc::GetRecordBatchSize to compute how much space is required
|
||||
ARROW_EXPORT
|
||||
Status SerializeRecordBatch(const RecordBatch& batch, const IpcWriteOptions& options,
|
||||
io::OutputStream* out);
|
||||
|
||||
/// \brief Serialize schema as encapsulated IPC message
|
||||
///
|
||||
/// \param[in] schema the schema to write
|
||||
/// \param[in] pool a MemoryPool to allocate memory from
|
||||
/// \return the serialized schema
|
||||
ARROW_EXPORT
|
||||
Result<std::shared_ptr<Buffer>> SerializeSchema(const Schema& schema,
|
||||
MemoryPool* pool = default_memory_pool());
|
||||
|
||||
/// \brief Write multiple record batches to OutputStream, including schema
|
||||
/// \param[in] batches a vector of batches. Must all have same schema
|
||||
/// \param[in] options options for serialization
|
||||
/// \param[in] dst the OutputStream to write to
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status WriteRecordBatchStream(const std::vector<std::shared_ptr<RecordBatch>>& batches,
|
||||
const IpcWriteOptions& options, io::OutputStream* dst);
|
||||
|
||||
/// \brief Compute the number of bytes needed to write an IPC payload
|
||||
/// including metadata
|
||||
///
|
||||
/// \param[in] payload the IPC payload to write
|
||||
/// \param[in] options write options
|
||||
/// \return the size of the complete encapsulated message
|
||||
ARROW_EXPORT
|
||||
int64_t GetPayloadSize(const IpcPayload& payload,
|
||||
const IpcWriteOptions& options = IpcWriteOptions::Defaults());
|
||||
|
||||
/// \brief Compute the number of bytes needed to write a record batch including metadata
|
||||
///
|
||||
/// \param[in] batch the record batch to write
|
||||
/// \param[out] size the size of the complete encapsulated message
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size);
|
||||
|
||||
/// \brief Compute the number of bytes needed to write a record batch including metadata
|
||||
///
|
||||
/// \param[in] batch the record batch to write
|
||||
/// \param[in] options options for serialization
|
||||
/// \param[out] size the size of the complete encapsulated message
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status GetRecordBatchSize(const RecordBatch& batch, const IpcWriteOptions& options,
|
||||
int64_t* size);
|
||||
|
||||
/// \brief Compute the number of bytes needed to write a tensor including metadata
|
||||
///
|
||||
/// \param[in] tensor the tensor to write
|
||||
/// \param[out] size the size of the complete encapsulated message
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status GetTensorSize(const Tensor& tensor, int64_t* size);
|
||||
|
||||
/// \brief EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal memory
|
||||
/// allocation
|
||||
///
|
||||
/// \param[in] tensor the Tensor to write
|
||||
/// \param[in] pool MemoryPool to allocate space for metadata
|
||||
/// \return the resulting Message
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<Message>> GetTensorMessage(const Tensor& tensor, MemoryPool* pool);
|
||||
|
||||
/// \brief Write arrow::Tensor as a contiguous message.
|
||||
///
|
||||
/// The metadata and body are written assuming 64-byte alignment. It is the
|
||||
/// user's responsibility to ensure that the OutputStream has been aligned
|
||||
/// to a 64-byte multiple before writing the message.
|
||||
///
|
||||
/// The message is written out as follows:
|
||||
/// \code
|
||||
/// <metadata size> <metadata> <tensor data>
|
||||
/// \endcode
|
||||
///
|
||||
/// \param[in] tensor the Tensor to write
|
||||
/// \param[in] dst the OutputStream to write to
|
||||
/// \param[out] metadata_length the actual metadata length, including padding
|
||||
/// \param[out] body_length the actual message body length
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length,
|
||||
int64_t* body_length);
|
||||
|
||||
/// \brief EXPERIMENTAL: Convert arrow::SparseTensor to a Message with minimal memory
|
||||
/// allocation
|
||||
///
|
||||
/// The message is written out as follows:
|
||||
/// \code
|
||||
/// <metadata size> <metadata> <sparse index> <sparse tensor body>
|
||||
/// \endcode
|
||||
///
|
||||
/// \param[in] sparse_tensor the SparseTensor to write
|
||||
/// \param[in] pool MemoryPool to allocate space for metadata
|
||||
/// \return the resulting Message
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<Message>> GetSparseTensorMessage(const SparseTensor& sparse_tensor,
|
||||
MemoryPool* pool);
|
||||
|
||||
/// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous message. The metadata,
|
||||
/// sparse index, and body are written assuming 64-byte alignment. It is the
|
||||
/// user's responsibility to ensure that the OutputStream has been aligned
|
||||
/// to a 64-byte multiple before writing the message.
|
||||
///
|
||||
/// \param[in] sparse_tensor the SparseTensor to write
|
||||
/// \param[in] dst the OutputStream to write to
|
||||
/// \param[out] metadata_length the actual metadata length, including padding
|
||||
/// \param[out] body_length the actual message body length
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst,
|
||||
int32_t* metadata_length, int64_t* body_length);
|
||||
|
||||
/// \brief Compute IpcPayload for the given schema
|
||||
/// \param[in] schema the Schema that is being serialized
|
||||
/// \param[in] options options for serialization
|
||||
/// \param[in] mapper object mapping dictionary fields to dictionary ids
|
||||
/// \param[out] out the returned IpcPayload
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status GetSchemaPayload(const Schema& schema, const IpcWriteOptions& options,
|
||||
const DictionaryFieldMapper& mapper, IpcPayload* out);
|
||||
|
||||
/// \brief Compute IpcPayload for a dictionary
|
||||
/// \param[in] id the dictionary id
|
||||
/// \param[in] dictionary the dictionary values
|
||||
/// \param[in] options options for serialization
|
||||
/// \param[out] payload the output IpcPayload
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status GetDictionaryPayload(int64_t id, const std::shared_ptr<Array>& dictionary,
|
||||
const IpcWriteOptions& options, IpcPayload* payload);
|
||||
|
||||
/// \brief Compute IpcPayload for a dictionary, specifying whether it is a delta
|
||||
/// \param[in] id the dictionary id
|
||||
/// \param[in] is_delta whether the dictionary is a delta dictionary
|
||||
/// \param[in] dictionary the dictionary values
|
||||
/// \param[in] options options for serialization
|
||||
/// \param[out] payload the output IpcPayload
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status GetDictionaryPayload(int64_t id, bool is_delta,
|
||||
const std::shared_ptr<Array>& dictionary,
|
||||
const IpcWriteOptions& options, IpcPayload* payload);
|
||||
|
||||
/// \brief Compute IpcPayload for the given record batch
|
||||
/// \param[in] batch the RecordBatch that is being serialized
|
||||
/// \param[in] options options for serialization
|
||||
/// \param[out] out the returned IpcPayload
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status GetRecordBatchPayload(const RecordBatch& batch, const IpcWriteOptions& options,
|
||||
IpcPayload* out);
|
||||
|
||||
/// \brief Compute IpcPayload for the given record batch and custom metadata
|
||||
/// \param[in] batch the RecordBatch that is being serialized
|
||||
/// \param[in] custom_metadata the custom metadata to be serialized with the record batch
|
||||
/// \param[in] options options for serialization
|
||||
/// \param[out] out the returned IpcPayload
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status GetRecordBatchPayload(
|
||||
const RecordBatch& batch,
|
||||
const std::shared_ptr<const KeyValueMetadata>& custom_metadata,
|
||||
const IpcWriteOptions& options, IpcPayload* out);
|
||||
|
||||
/// \brief Write an IPC payload to the given stream.
|
||||
/// \param[in] payload the payload to write
|
||||
/// \param[in] options options for serialization
|
||||
/// \param[in] dst The stream to write the payload to.
|
||||
/// \param[out] metadata_length the length of the serialized metadata
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status WriteIpcPayload(const IpcPayload& payload, const IpcWriteOptions& options,
|
||||
io::OutputStream* dst, int32_t* metadata_length);
|
||||
|
||||
/// \brief Compute IpcPayload for the given sparse tensor
|
||||
/// \param[in] sparse_tensor the SparseTensor that is being serialized
|
||||
/// \param[in,out] pool for any required temporary memory allocations
|
||||
/// \param[out] out the returned IpcPayload
|
||||
/// \return Status
|
||||
ARROW_EXPORT
|
||||
Status GetSparseTensorPayload(const SparseTensor& sparse_tensor, MemoryPool* pool,
|
||||
IpcPayload* out);
|
||||
|
||||
namespace internal {
|
||||
|
||||
// These internal APIs may change without warning or deprecation
|
||||
|
||||
/// \brief Abstract interface for an object that consumes IpcPayload instances.
///
/// WritePayload() and Close() are pure virtual and must be implemented by
/// every concrete subclass; Start() is virtual but has a default
/// implementation, so overriding it is optional.
class ARROW_EXPORT IpcPayloadWriter {
|
||||
public:
|
||||
/// Virtual destructor (defined out of line) so that a concrete writer can be
/// destroyed through a pointer to this base class.
virtual ~IpcPayloadWriter();
|
||||
|
||||
// Default implementation is a no-op
// NOTE(review): presumably called once before the first WritePayload() —
// confirm against the callers of this interface.
/// \return Status
virtual Status Start();
|
||||
|
||||
/// \brief Consume a single IpcPayload.
/// \param[in] payload the payload to write
/// \return Status
virtual Status WritePayload(const IpcPayload& payload) = 0;
|
||||
|
||||
/// \brief Finish writing.
// NOTE(review): presumably no further payloads may be written after Close() —
// confirm against the concrete implementations.
/// \return Status
virtual Status Close() = 0;
|
||||
};
|
||||
|
||||
/// Create a new IPC payload stream writer from stream sink. User is
|
||||
/// responsible for closing the actual OutputStream.
|
||||
///
|
||||
/// \param[in] sink output stream to write to
|
||||
/// \param[in] options options for serialization
|
||||
/// \return Result<std::unique_ptr<IpcPayloadWriter>>
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadStreamWriter(
|
||||
io::OutputStream* sink, const IpcWriteOptions& options = IpcWriteOptions::Defaults());
|
||||
|
||||
/// Create a new IPC payload file writer from stream sink.
|
||||
///
|
||||
/// \param[in] sink output stream to write to
|
||||
/// \param[in] schema the schema of the record batches to be written
|
||||
/// \param[in] options options for serialization, optional
|
||||
/// \param[in] metadata custom metadata for File Footer, optional
|
||||
/// \return Result<std::unique_ptr<IpcPayloadWriter>>
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadFileWriter(
|
||||
io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
|
||||
const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
|
||||
const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
|
||||
|
||||
/// Create a new RecordBatchWriter from IpcPayloadWriter and schema.
|
||||
///
|
||||
/// The format is implicitly the IPC stream format (allowing dictionary
|
||||
/// replacement and deltas).
|
||||
///
|
||||
/// \param[in] sink the IpcPayloadWriter to write to
|
||||
/// \param[in] schema the schema of the record batches to be written
|
||||
/// \param[in] options options for serialization
|
||||
/// \return Result<std::unique_ptr<RecordBatchWriter>>
|
||||
ARROW_EXPORT
|
||||
Result<std::unique_ptr<RecordBatchWriter>> OpenRecordBatchWriter(
|
||||
std::unique_ptr<IpcPayloadWriter> sink, const std::shared_ptr<Schema>& schema,
|
||||
const IpcWriteOptions& options = IpcWriteOptions::Defaults());
|
||||
|
||||
} // namespace internal
|
||||
} // namespace ipc
|
||||
} // namespace arrow
|
||||
Reference in New Issue
Block a user