Merge PR_218: openai_rev package with new Streamlit chat app
42
venv/lib/python3.9/site-packages/pyarrow/__init__.pxd
Normal file
@@ -0,0 +1,42 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from libcpp.memory cimport shared_ptr
from pyarrow.includes.libarrow cimport (CArray, CBuffer, CDataType,
                                        CField, CRecordBatch, CSchema,
                                        CTable, CTensor, CSparseCOOTensor,
                                        CSparseCSRMatrix, CSparseCSCMatrix,
                                        CSparseCSFTensor)

cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
    cdef int import_pyarrow() except -1
    cdef object wrap_buffer(const shared_ptr[CBuffer]& buffer)
    cdef object wrap_data_type(const shared_ptr[CDataType]& type)
    cdef object wrap_field(const shared_ptr[CField]& field)
    cdef object wrap_schema(const shared_ptr[CSchema]& schema)
    cdef object wrap_array(const shared_ptr[CArray]& sp_array)
    cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor)
    cdef object wrap_sparse_tensor_coo(
        const shared_ptr[CSparseCOOTensor]& sp_sparse_tensor)
    cdef object wrap_sparse_tensor_csr(
        const shared_ptr[CSparseCSRMatrix]& sp_sparse_tensor)
    cdef object wrap_sparse_tensor_csc(
        const shared_ptr[CSparseCSCMatrix]& sp_sparse_tensor)
    cdef object wrap_sparse_tensor_csf(
        const shared_ptr[CSparseCSFTensor]& sp_sparse_tensor)
    cdef object wrap_table(const shared_ptr[CTable]& ctable)
    cdef object wrap_batch(const shared_ptr[CRecordBatch]& cbatch)
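These declarations are what let third-party Cython modules hand Arrow objects back and forth with C++. A minimal sketch of such a module, assuming pyarrow's include directory is on the compiler include path (module and function names here are hypothetical):

# example_ext.pyx (hypothetical third-party module)
from libcpp.memory cimport shared_ptr
cimport pyarrow
from pyarrow.includes.libarrow cimport CArray

pyarrow.import_pyarrow()  # initialize the bridge before any wrap_* call

cdef object array_to_python(const shared_ptr[CArray]& sp_array):
    # hand a C++ Arrow array back to Python as a pyarrow.Array
    return pyarrow.wrap_array(sp_array)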
509
venv/lib/python3.9/site-packages/pyarrow/__init__.py
Normal file
@@ -0,0 +1,509 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# flake8: noqa

"""
PyArrow is the python implementation of Apache Arrow.

Apache Arrow is a cross-language development platform for in-memory data.
It specifies a standardized language-independent columnar memory format for
flat and hierarchical data, organized for efficient analytic operations on
modern hardware. It also provides computational libraries and zero-copy
streaming messaging and interprocess communication.

For more information see the official page at https://arrow.apache.org
"""

import gc as _gc
import importlib as _importlib
import os as _os
import platform as _platform
import sys as _sys
import warnings as _warnings

try:
    from ._generated_version import version as __version__
except ImportError:
    # Package is not installed, parse git tag at runtime
    try:
        import setuptools_scm
        # Code duplicated from setup.py to avoid a dependency on each other

        def parse_git(root, **kwargs):
            """
            Parse function for setuptools_scm that ignores tags for non-C++
            subprojects, e.g. apache-arrow-js-XXX tags.
            """
            from setuptools_scm.git import parse
            kwargs['describe_command'] = \
                "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'"
            return parse(root, **kwargs)
        __version__ = setuptools_scm.get_version('../',
                                                 parse=parse_git)
    except ImportError:
        __version__ = None

# ARROW-8684: Disable GC while initializing Cython extension module,
# to workaround Cython bug in https://github.com/cython/cython/issues/3603
_gc_enabled = _gc.isenabled()
_gc.disable()
import pyarrow.lib as _lib
if _gc_enabled:
    _gc.enable()

from pyarrow.lib import (BuildInfo, RuntimeInfo, MonthDayNano,
                         VersionInfo, cpp_build_info, cpp_version,
                         cpp_version_info, runtime_info, cpu_count,
                         set_cpu_count, enable_signal_handlers,
                         io_thread_count, set_io_thread_count)


def show_versions():
    """
    Print various version information, to help with error reporting.
    """
    def print_entry(label, value):
        print(f"{label: <26}: {value: <8}")

    print("pyarrow version info\n--------------------")
    print_entry("Package kind", cpp_build_info.package_kind
                if len(cpp_build_info.package_kind) > 0
                else "not indicated")
    print_entry("Arrow C++ library version", cpp_build_info.version)
    print_entry("Arrow C++ compiler",
                f"{cpp_build_info.compiler_id} {cpp_build_info.compiler_version}")
    print_entry("Arrow C++ compiler flags", cpp_build_info.compiler_flags)
    print_entry("Arrow C++ git revision", cpp_build_info.git_id)
    print_entry("Arrow C++ git description", cpp_build_info.git_description)
    print_entry("Arrow C++ build type", cpp_build_info.build_type)


def _module_is_available(module):
    try:
        _importlib.import_module(f'pyarrow.{module}')
    except ImportError:
        return False
    else:
        return True


def _filesystem_is_available(fs):
    try:
        import pyarrow.fs
    except ImportError:
        return False

    try:
        getattr(pyarrow.fs, fs)
    except (ImportError, AttributeError):
        return False
    else:
        return True


def show_info():
    """
    Print detailed version and platform information, for error reporting
    """
    show_versions()

    def print_entry(label, value):
        print(f"  {label: <20}: {value: <8}")

    print("\nPlatform:")
    print_entry("OS / Arch", f"{_platform.system()} {_platform.machine()}")
    print_entry("SIMD Level", runtime_info().simd_level)
    print_entry("Detected SIMD Level", runtime_info().detected_simd_level)

    pool = default_memory_pool()
    print("\nMemory:")
    print_entry("Default backend", pool.backend_name)
    print_entry("Bytes allocated", f"{pool.bytes_allocated()} bytes")
    print_entry("Max memory", f"{pool.max_memory()} bytes")
    print_entry("Supported Backends", ', '.join(supported_memory_backends()))

    print("\nOptional modules:")
    modules = ["csv", "cuda", "dataset", "feather", "flight", "fs", "gandiva", "json",
               "orc", "parquet", "plasma"]
    for module in modules:
        status = "Enabled" if _module_is_available(module) else "-"
        print(f"  {module: <20}: {status: <8}")

    print("\nFilesystems:")
    filesystems = ["GcsFileSystem", "HadoopFileSystem", "S3FileSystem"]
    for fs in filesystems:
        status = "Enabled" if _filesystem_is_available(fs) else "-"
        print(f"  {fs: <20}: {status: <8}")

    print("\nCompression Codecs:")
    codecs = ["brotli", "bz2", "gzip", "lz4_frame", "lz4", "snappy", "zstd"]
    for codec in codecs:
        status = "Enabled" if Codec.is_available(codec) else "-"
        print(f"  {codec: <20}: {status: <8}")


from pyarrow.lib import (null, bool_,
                         int8, int16, int32, int64,
                         uint8, uint16, uint32, uint64,
                         time32, time64, timestamp, date32, date64, duration,
                         month_day_nano_interval,
                         float16, float32, float64,
                         binary, string, utf8,
                         large_binary, large_string, large_utf8,
                         decimal128, decimal256,
                         list_, large_list, map_, struct,
                         union, sparse_union, dense_union,
                         dictionary,
                         field,
                         type_for_alias,
                         DataType, DictionaryType, StructType,
                         ListType, LargeListType, MapType, FixedSizeListType,
                         UnionType, SparseUnionType, DenseUnionType,
                         TimestampType, Time32Type, Time64Type, DurationType,
                         FixedSizeBinaryType, Decimal128Type, Decimal256Type,
                         BaseExtensionType, ExtensionType,
                         PyExtensionType, UnknownExtensionType,
                         register_extension_type, unregister_extension_type,
                         DictionaryMemo,
                         KeyValueMetadata,
                         Field,
                         Schema,
                         schema,
                         unify_schemas,
                         Array, Tensor,
                         array, chunked_array, record_batch, nulls, repeat,
                         SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
                         SparseCSFTensor,
                         infer_type, from_numpy_dtype,
                         NullArray,
                         NumericArray, IntegerArray, FloatingPointArray,
                         BooleanArray,
                         Int8Array, UInt8Array,
                         Int16Array, UInt16Array,
                         Int32Array, UInt32Array,
                         Int64Array, UInt64Array,
                         ListArray, LargeListArray, MapArray,
                         FixedSizeListArray, UnionArray,
                         BinaryArray, StringArray,
                         LargeBinaryArray, LargeStringArray,
                         FixedSizeBinaryArray,
                         DictionaryArray,
                         Date32Array, Date64Array, TimestampArray,
                         Time32Array, Time64Array, DurationArray,
                         MonthDayNanoIntervalArray,
                         Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
                         scalar, NA, _NULL as NULL, Scalar,
                         NullScalar, BooleanScalar,
                         Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
                         UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
                         HalfFloatScalar, FloatScalar, DoubleScalar,
                         Decimal128Scalar, Decimal256Scalar,
                         ListScalar, LargeListScalar, FixedSizeListScalar,
                         Date32Scalar, Date64Scalar,
                         Time32Scalar, Time64Scalar,
                         TimestampScalar, DurationScalar,
                         MonthDayNanoIntervalScalar,
                         BinaryScalar, LargeBinaryScalar,
                         StringScalar, LargeStringScalar,
                         FixedSizeBinaryScalar, DictionaryScalar,
                         MapScalar, StructScalar, UnionScalar,
                         ExtensionScalar)

# Buffers, allocation
from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer,
                         Codec, compress, decompress, allocate_buffer)

from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool,
                         total_allocated_bytes, set_memory_pool,
                         default_memory_pool, system_memory_pool,
                         jemalloc_memory_pool, mimalloc_memory_pool,
                         logging_memory_pool, proxy_memory_pool,
                         log_memory_allocations, jemalloc_set_decay_ms,
                         supported_memory_backends)

# I/O
from pyarrow.lib import (NativeFile, PythonFile,
                         BufferedInputStream, BufferedOutputStream,
                         CompressedInputStream, CompressedOutputStream,
                         TransformInputStream, transcoding_input_stream,
                         FixedSizeBufferWriter,
                         BufferReader, BufferOutputStream,
                         OSFile, MemoryMappedFile, memory_map,
                         create_memory_map, MockOutputStream,
                         input_stream, output_stream)

from pyarrow._hdfsio import HdfsFile, have_libhdfs

from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table,
                         concat_arrays, concat_tables, TableGroupBy,
                         RecordBatchReader)

# Exceptions
from pyarrow.lib import (ArrowCancelled,
                         ArrowCapacityError,
                         ArrowException,
                         ArrowKeyError,
                         ArrowIndexError,
                         ArrowInvalid,
                         ArrowIOError,
                         ArrowMemoryError,
                         ArrowNotImplementedError,
                         ArrowTypeError,
                         ArrowSerializationError)

# Serialization
from pyarrow.lib import (deserialize_from, deserialize,
                         deserialize_components,
                         serialize, serialize_to, read_serialized,
                         SerializationCallbackError,
                         DeserializationCallbackError)

import pyarrow.hdfs as hdfs

from pyarrow.ipc import serialize_pandas, deserialize_pandas
import pyarrow.ipc as ipc

from pyarrow.serialization import (default_serialization_context,
                                   register_default_serialization_handlers,
                                   register_torch_serialization_handlers)

import pyarrow.types as types


# deprecated top-level access


from pyarrow.filesystem import FileSystem as _FileSystem
from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem

from pyarrow.lib import SerializationContext as _SerializationContext
from pyarrow.lib import SerializedPyObject as _SerializedPyObject


_localfs = _LocalFileSystem._get_instance()


_msg = (
    "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
)

_serialization_msg = (
    "'pyarrow.{0}' is deprecated and will be removed in a future version. "
    "Use pickle or the pyarrow IPC functionality instead."
)

_deprecated = {
    "localfs": (_localfs, "LocalFileSystem"),
    "FileSystem": (_FileSystem, "FileSystem"),
    "LocalFileSystem": (_LocalFileSystem, "LocalFileSystem"),
    "HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
}

_serialization_deprecated = {
    "SerializationContext": _SerializationContext,
    "SerializedPyObject": _SerializedPyObject,
}


def __getattr__(name):
    if name in _deprecated:
        obj, new_name = _deprecated[name]
        _warnings.warn(_msg.format(name, new_name),
                       FutureWarning, stacklevel=2)
        return obj
    elif name in _serialization_deprecated:
        _warnings.warn(_serialization_msg.format(name),
                       FutureWarning, stacklevel=2)
        return _serialization_deprecated[name]

    raise AttributeError(
        "module 'pyarrow' has no attribute '{0}'".format(name)
    )
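The module-level __getattr__ above (PEP 562) is what turns access to these retired top-level names into a FutureWarning instead of an immediate break. A minimal sketch of the observable behavior:

import warnings
import pyarrow as pa

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    fs = pa.localfs                 # resolved through __getattr__
assert issubclass(caught[-1].category, FutureWarning)

# unknown names still raise AttributeError as usual:
# pa.no_such_name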


# Entry point for starting the plasma store


def _plasma_store_entry_point():
    """
    DEPRECATED: Entry point for starting the plasma store.

    This can be used by invoking e.g.
    ``plasma_store -s /tmp/plasma -m 1000000000``
    from the command line and will start the plasma_store executable with the
    given arguments.

    .. deprecated:: 10.0.0
        Plasma is deprecated since Arrow 10.0.0. It will be removed in 12.0.0 or so.
    """
    _warnings.warn(
        "Plasma is deprecated since Arrow 10.0.0. It will be removed in 12.0.0 or so.",
        DeprecationWarning)

    import pyarrow
    plasma_store_executable = _os.path.join(pyarrow.__path__[0],
                                            "plasma-store-server")
    _os.execv(plasma_store_executable, _sys.argv)


# ----------------------------------------------------------------------
# Deprecations

from pyarrow.util import _deprecate_api, _deprecate_class


# TODO: Deprecate these somehow in the pyarrow namespace
from pyarrow.ipc import (Message, MessageReader, MetadataVersion,
                         RecordBatchFileReader, RecordBatchFileWriter,
                         RecordBatchStreamReader, RecordBatchStreamWriter)

# ----------------------------------------------------------------------
# Returning absolute path to the pyarrow include directory (if bundled, e.g. in
# wheels)


def get_include():
    """
    Return absolute path to directory containing Arrow C++ include
    headers. Similar to numpy.get_include
    """
    return _os.path.join(_os.path.dirname(__file__), 'include')


def _get_pkg_config_executable():
    return _os.environ.get('PKG_CONFIG', 'pkg-config')


def _has_pkg_config(pkgname):
    import subprocess
    try:
        return subprocess.call([_get_pkg_config_executable(),
                                '--exists', pkgname]) == 0
    except FileNotFoundError:
        return False


def _read_pkg_config_variable(pkgname, cli_args):
    import subprocess
    cmd = [_get_pkg_config_executable(), pkgname] + cli_args
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError("pkg-config failed: " + err.decode('utf8'))
    return out.rstrip().decode('utf8')


def get_libraries():
    """
    Return list of library names to include in the `libraries` argument for C
    or Cython extensions using pyarrow
    """
    return ['arrow_python', 'arrow']


def create_library_symlinks():
    """
    With Linux and macOS wheels, the bundled shared libraries have an embedded
    ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them
    with -larrow won't work unless we create symlinks at locations like
    site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses
    prior problems we had with shipping two copies of the shared libraries to
    permit third party projects like turbodbc to build their C++ extensions
    against the pyarrow wheels.

    This function must only be invoked once and only when the shared libraries
    are bundled with the Python package, which should only apply to wheel-based
    installs. It requires write access to the site-packages/pyarrow directory
    and so depending on your system may need to be run with root.
    """
    import glob
    if _sys.platform == 'win32':
        return
    package_cwd = _os.path.dirname(__file__)

    if _sys.platform == 'linux':
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))

        def get_symlink_path(hard_path):
            return hard_path.rsplit('.', 1)[0]
    else:
        bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))

        def get_symlink_path(hard_path):
            return '.'.join((hard_path.rsplit('.', 2)[0], 'dylib'))

    for lib_hard_path in bundled_libs:
        symlink_path = get_symlink_path(lib_hard_path)
        if _os.path.exists(symlink_path):
            continue
        try:
            _os.symlink(lib_hard_path, symlink_path)
        except PermissionError:
            print("Tried creating symlink {}. If you need to link to "
                  "bundled shared libraries, run "
                  "pyarrow.create_library_symlinks() as root")


def get_library_dirs():
    """
    Return lists of directories likely to contain Arrow C++ libraries for
    linking C or Cython extensions using pyarrow
    """
    package_cwd = _os.path.dirname(__file__)
    library_dirs = [package_cwd]

    def append_library_dir(library_dir):
        if library_dir not in library_dirs:
            library_dirs.append(library_dir)

    # Search library paths via pkg-config. This is necessary if the user
    # installed libarrow and the other shared libraries manually and they
    # are not shipped inside the pyarrow package (see also ARROW-2976).
    pkg_config_executable = _os.environ.get('PKG_CONFIG') or 'pkg-config'
    for pkgname in ["arrow", "arrow_python"]:
        if _has_pkg_config(pkgname):
            library_dir = _read_pkg_config_variable(pkgname,
                                                    ["--libs-only-L"])
            # pkg-config output could be empty if Arrow is installed
            # as a system package.
            if library_dir:
                if not library_dir.startswith("-L"):
                    raise ValueError(
                        "pkg-config --libs-only-L returned unexpected "
                        "value {!r}".format(library_dir))
                append_library_dir(library_dir[2:])

    if _sys.platform == 'win32':
        # TODO(wesm): Is this necessary, or does setuptools within a conda
        # installation add Library\lib to the linker path for MSVC?
        python_base_install = _os.path.dirname(_sys.executable)
        library_dir = _os.path.join(python_base_install, 'Library', 'lib')

        if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
            append_library_dir(library_dir)

    # ARROW-4074: Allow for ARROW_HOME to be set to some other directory
    if _os.environ.get('ARROW_HOME'):
        append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
    else:
        # Python wheels bundle the Arrow libraries in the pyarrow directory.
        append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))

    return library_dirs
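get_include(), get_libraries() and get_library_dirs() are meant to be fed into a setuptools build, the same way numpy.get_include() is used. A minimal setup.py sketch under that assumption (the module and file names are hypothetical):

from setuptools import Extension, setup
import pyarrow as pa

ext = Extension(
    "myext",                            # hypothetical module name
    sources=["myext.cpp"],              # hypothetical source file
    include_dirs=[pa.get_include()],
    libraries=pa.get_libraries(),       # ['arrow_python', 'arrow']
    library_dirs=pa.get_library_dirs(),
)
setup(name="myext", ext_modules=[ext])

With a wheel-based install, create_library_symlinks() may need to run first so that -larrow resolves against the bundled, ABI-versioned shared libraries.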
BIN
venv/lib/python3.9/site-packages/pyarrow/_compute.cpython-39-darwin.so
Executable file
Binary file not shown.
64
venv/lib/python3.9/site-packages/pyarrow/_compute.pxd
Normal file
@@ -0,0 +1,64 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from pyarrow.lib cimport *
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *

cdef class ScalarUdfContext(_Weakrefable):
    cdef:
        CScalarUdfContext c_context

    cdef void init(self, const CScalarUdfContext& c_context)


cdef class FunctionOptions(_Weakrefable):
    cdef:
        shared_ptr[CFunctionOptions] wrapped

    cdef const CFunctionOptions* get_options(self) except NULL
    cdef void init(self, const shared_ptr[CFunctionOptions]& sp)

    cdef inline shared_ptr[CFunctionOptions] unwrap(self)


cdef class _SortOptions(FunctionOptions):
    pass


cdef CExpression _bind(Expression filter, Schema schema) except *


cdef class Expression(_Weakrefable):

    cdef:
        CExpression expr

    cdef void init(self, const CExpression& sp)

    @staticmethod
    cdef wrap(const CExpression& sp)

    cdef inline CExpression unwrap(self)

    @staticmethod
    cdef Expression _expr_or_scalar(object expr)


cdef CExpression _true
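The Expression machinery declared here is surfaced in Python through pyarrow.compute and pyarrow.dataset: expressions are built lazily and only bound to a schema (via _bind above) when a scan actually runs. A small sketch of the public surface, with a hypothetical data path:

import pyarrow.dataset as ds

# build an unbound expression tree; nothing is evaluated yet
expr = (ds.field("year") >= 2020) & (ds.field("country") == "DE")

dataset = ds.dataset("data/", format="parquet")  # hypothetical path
table = dataset.to_table(filter=expr)            # bound against the schema here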
2715
venv/lib/python3.9/site-packages/pyarrow/_compute.pyx
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,56 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
Custom documentation additions for compute functions.
"""

function_doc_additions = {}

function_doc_additions["filter"] = """
    Examples
    --------
    >>> import pyarrow as pa
    >>> arr = pa.array(["a", "b", "c", None, "e"])
    >>> mask = pa.array([True, False, None, False, True])
    >>> arr.filter(mask)
    <pyarrow.lib.StringArray object at ...>
    [
      "a",
      "e"
    ]
    >>> arr.filter(mask, null_selection_behavior='emit_null')
    <pyarrow.lib.StringArray object at ...>
    [
      "a",
      null,
      "e"
    ]
    """

function_doc_additions["mode"] = """
    Examples
    --------
    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
    >>> modes = pc.mode(arr, 2)
    >>> modes[0]
    <pyarrow.StructScalar: [('mode', 2), ('count', 5)]>
    >>> modes[1]
    <pyarrow.StructScalar: [('mode', 1), ('count', 2)]>
    """
BIN
venv/lib/python3.9/site-packages/pyarrow/_csv.cpython-39-darwin.so
Executable file
Binary file not shown.
55
venv/lib/python3.9/site-packages/pyarrow/_csv.pxd
Normal file
@@ -0,0 +1,55 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport _Weakrefable


cdef class ConvertOptions(_Weakrefable):
    cdef:
        unique_ptr[CCSVConvertOptions] options

    @staticmethod
    cdef ConvertOptions wrap(CCSVConvertOptions options)


cdef class ParseOptions(_Weakrefable):
    cdef:
        unique_ptr[CCSVParseOptions] options
        object _invalid_row_handler

    @staticmethod
    cdef ParseOptions wrap(CCSVParseOptions options)


cdef class ReadOptions(_Weakrefable):
    cdef:
        unique_ptr[CCSVReadOptions] options
        public object encoding

    @staticmethod
    cdef ReadOptions wrap(CCSVReadOptions options)


cdef class WriteOptions(_Weakrefable):
    cdef:
        unique_ptr[CCSVWriteOptions] options

    @staticmethod
    cdef WriteOptions wrap(CCSVWriteOptions options)
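The four option holders declared here mirror the keyword surface of pyarrow.csv.read_csv, which takes one instance of each. A minimal sketch (the file name is hypothetical):

import pyarrow.csv as csv

table = csv.read_csv(
    "data.csv",                                        # hypothetical file
    read_options=csv.ReadOptions(encoding="utf8"),
    parse_options=csv.ParseOptions(delimiter=";"),
    convert_options=csv.ConvertOptions(strings_can_be_null=True),
)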
1498
venv/lib/python3.9/site-packages/pyarrow/_csv.pyx
Normal file
File diff suppressed because it is too large
67
venv/lib/python3.9/site-packages/pyarrow/_cuda.pxd
Normal file
@@ -0,0 +1,67 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from pyarrow.lib cimport *
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_cuda cimport *


cdef class Context(_Weakrefable):
    cdef:
        shared_ptr[CCudaContext] context
        int device_number

    cdef void init(self, const shared_ptr[CCudaContext]& ctx)


cdef class IpcMemHandle(_Weakrefable):
    cdef:
        shared_ptr[CCudaIpcMemHandle] handle

    cdef void init(self, shared_ptr[CCudaIpcMemHandle]& h)


cdef class CudaBuffer(Buffer):
    cdef:
        shared_ptr[CCudaBuffer] cuda_buffer
        object base

    cdef void init_cuda(self,
                        const shared_ptr[CCudaBuffer]& buffer,
                        object base)


cdef class HostBuffer(Buffer):
    cdef:
        shared_ptr[CCudaHostBuffer] host_buffer

    cdef void init_host(self, const shared_ptr[CCudaHostBuffer]& buffer)


cdef class BufferReader(NativeFile):
    cdef:
        CCudaBufferReader* reader
        CudaBuffer buffer


cdef class BufferWriter(NativeFile):
    cdef:
        CCudaBufferWriter* writer
        CudaBuffer buffer
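Context owns a device and CudaBuffer wraps device memory; both are only usable with a CUDA-enabled Arrow build, which this wheel may not include. A minimal host/device round-trip sketch under that assumption:

from pyarrow import cuda

ctx = cuda.Context(0)                          # device number 0
cbuf = ctx.buffer_from_data(b"device data")    # host -> device copy
host = cbuf.copy_to_host()                     # device -> host, a pyarrow Buffer
assert host.to_pybytes() == b"device data"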
1060
venv/lib/python3.9/site-packages/pyarrow/_cuda.pyx
Normal file
File diff suppressed because it is too large
BIN
venv/lib/python3.9/site-packages/pyarrow/_dataset.cpython-39-darwin.so
Executable file
Binary file not shown.
181
venv/lib/python3.9/site-packages/pyarrow/_dataset.pxd
Normal file
@@ -0,0 +1,181 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

"""Dataset is currently unstable. APIs subject to change without notice."""

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.lib cimport *
from pyarrow._fs cimport FileSystem


cdef CFileSource _make_file_source(object file, FileSystem filesystem=*)


cdef class DatasetFactory(_Weakrefable):

    cdef:
        shared_ptr[CDatasetFactory] wrapped
        CDatasetFactory* factory

    cdef init(self, const shared_ptr[CDatasetFactory]& sp)

    @staticmethod
    cdef wrap(const shared_ptr[CDatasetFactory]& sp)

    cdef inline shared_ptr[CDatasetFactory] unwrap(self) nogil


cdef class Dataset(_Weakrefable):

    cdef:
        shared_ptr[CDataset] wrapped
        CDataset* dataset
        public dict _scan_options

    cdef void init(self, const shared_ptr[CDataset]& sp)

    @staticmethod
    cdef wrap(const shared_ptr[CDataset]& sp)

    cdef shared_ptr[CDataset] unwrap(self) nogil


cdef class Scanner(_Weakrefable):
    cdef:
        shared_ptr[CScanner] wrapped
        CScanner* scanner

    cdef void init(self, const shared_ptr[CScanner]& sp)

    @staticmethod
    cdef wrap(const shared_ptr[CScanner]& sp)

    cdef shared_ptr[CScanner] unwrap(self)

    @staticmethod
    cdef shared_ptr[CScanOptions] _make_scan_options(Dataset dataset, dict py_scanoptions) except *


cdef class FragmentScanOptions(_Weakrefable):

    cdef:
        shared_ptr[CFragmentScanOptions] wrapped

    cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp)

    @staticmethod
    cdef wrap(const shared_ptr[CFragmentScanOptions]& sp)


cdef class FileFormat(_Weakrefable):

    cdef:
        shared_ptr[CFileFormat] wrapped
        CFileFormat* format

    cdef void init(self, const shared_ptr[CFileFormat]& sp)

    @staticmethod
    cdef wrap(const shared_ptr[CFileFormat]& sp)

    cdef inline shared_ptr[CFileFormat] unwrap(self)

    cdef _set_default_fragment_scan_options(self, FragmentScanOptions options)

    # Return a WrittenFile after a file was written.
    # May be overridden by subclasses, e.g. to add metadata.
    cdef WrittenFile _finish_write(self, path, base_dir,
                                   CFileWriter* file_writer)


cdef class FileWriteOptions(_Weakrefable):

    cdef:
        shared_ptr[CFileWriteOptions] wrapped
        CFileWriteOptions* c_options

    cdef void init(self, const shared_ptr[CFileWriteOptions]& sp)

    @staticmethod
    cdef wrap(const shared_ptr[CFileWriteOptions]& sp)

    cdef inline shared_ptr[CFileWriteOptions] unwrap(self)


cdef class Fragment(_Weakrefable):

    cdef:
        shared_ptr[CFragment] wrapped
        CFragment* fragment

    cdef void init(self, const shared_ptr[CFragment]& sp)

    @staticmethod
    cdef wrap(const shared_ptr[CFragment]& sp)

    cdef inline shared_ptr[CFragment] unwrap(self)


cdef class FileFragment(Fragment):

    cdef:
        CFileFragment* file_fragment

    cdef void init(self, const shared_ptr[CFragment]& sp)


cdef class Partitioning(_Weakrefable):

    cdef:
        shared_ptr[CPartitioning] wrapped
        CPartitioning* partitioning

    cdef init(self, const shared_ptr[CPartitioning]& sp)

    @staticmethod
    cdef wrap(const shared_ptr[CPartitioning]& sp)

    cdef inline shared_ptr[CPartitioning] unwrap(self)


cdef class PartitioningFactory(_Weakrefable):

    cdef:
        shared_ptr[CPartitioningFactory] wrapped
        CPartitioningFactory* factory

    cdef init(self, const shared_ptr[CPartitioningFactory]& sp)

    @staticmethod
    cdef wrap(const shared_ptr[CPartitioningFactory]& sp)

    cdef inline shared_ptr[CPartitioningFactory] unwrap(self)


cdef class WrittenFile(_Weakrefable):

    # The full path to the created file
    cdef public str path
    # Optional Parquet metadata
    # This metadata will have the file path attribute set to the path of
    # the written file.
    cdef public object metadata
    # The size of the file in bytes
    cdef public int64_t size
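These declarations back the public pyarrow.dataset API: a DatasetFactory discovers files, a Dataset hands out Fragments, and a Scanner materializes them with projection and predicate push-down. A small sketch of that surface (paths hypothetical):

import pyarrow.dataset as ds

dataset = ds.dataset("warehouse/events/", format="parquet",
                     partitioning="hive")      # factory + discovery
print(dataset.schema)

# scan with projection and a push-down filter
table = dataset.to_table(columns=["user_id", "ts"],
                         filter=ds.field("ts") > 0)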
3016
venv/lib/python3.9/site-packages/pyarrow/_dataset.pyx
Normal file
File diff suppressed because it is too large
BIN
venv/lib/python3.9/site-packages/pyarrow/_dataset_orc.cpython-39-darwin.so
Executable file
Binary file not shown.
42
venv/lib/python3.9/site-packages/pyarrow/_dataset_orc.pyx
Normal file
@@ -0,0 +1,42 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

"""Dataset support for ORC file format."""

from pyarrow.lib cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *

from pyarrow._dataset cimport FileFormat


cdef class OrcFileFormat(FileFormat):

    def __init__(self):
        self.init(shared_ptr[CFileFormat](new COrcFileFormat()))

    def equals(self, OrcFileFormat other):
        return True

    @property
    def default_extname(self):
        return "orc"

    def __reduce__(self):
        return OrcFileFormat, tuple()
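OrcFileFormat carries no options of its own (hence equals() returning True unconditionally and the empty __reduce__ payload); it is normally reached through the format= argument. A minimal sketch (path hypothetical):

import pyarrow.dataset as ds

dataset = ds.dataset("data/orc/", format="orc")  # selects OrcFileFormat
table = dataset.to_table()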
BIN
venv/lib/python3.9/site-packages/pyarrow/_dataset_parquet.cpython-39-darwin.so
Executable file
Binary file not shown.
862
venv/lib/python3.9/site-packages/pyarrow/_dataset_parquet.pyx
Normal file
@@ -0,0 +1,862 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

"""Dataset support for Parquet file format."""

from cython.operator cimport dereference as deref

import os
import warnings

import pyarrow as pa
from pyarrow.lib cimport *
from pyarrow.lib import frombytes, tobytes
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.includes.libarrow_dataset_parquet cimport *
from pyarrow._fs cimport FileSystem
from pyarrow.util import _is_path_like, _stringify_path

from pyarrow._compute cimport Expression, _bind
from pyarrow._dataset cimport (
    _make_file_source,
    DatasetFactory,
    FileFormat,
    FileFragment,
    FileWriteOptions,
    Fragment,
    FragmentScanOptions,
    Partitioning,
    PartitioningFactory,
    WrittenFile
)


from pyarrow._parquet cimport (
    _create_writer_properties, _create_arrow_writer_properties,
    FileMetaData, RowGroupMetaData, ColumnChunkMetaData
)


cdef Expression _true = Expression._scalar(True)


ctypedef CParquetFileWriter* _CParquetFileWriterPtr


cdef class ParquetFileFormat(FileFormat):
    """
    FileFormat for Parquet

    Parameters
    ----------
    read_options : ParquetReadOptions
        Read options for the file.
    default_fragment_scan_options : ParquetFragmentScanOptions
        Scan Options for the file.
    **kwargs : dict
        Additional options for read option or scan option
    """

    cdef:
        CParquetFileFormat* parquet_format

    def __init__(self, read_options=None,
                 default_fragment_scan_options=None, **kwargs):
        cdef:
            shared_ptr[CParquetFileFormat] wrapped
            CParquetFileFormatReaderOptions* options

        # Read/scan options
        read_options_args = {option: kwargs[option] for option in kwargs
                             if option in _PARQUET_READ_OPTIONS}
        scan_args = {option: kwargs[option] for option in kwargs
                     if option not in _PARQUET_READ_OPTIONS}
        if read_options and read_options_args:
            duplicates = ', '.join(sorted(read_options_args))
            raise ValueError(f'If `read_options` is given, '
                             f'cannot specify {duplicates}')
        if default_fragment_scan_options and scan_args:
            duplicates = ', '.join(sorted(scan_args))
            raise ValueError(f'If `default_fragment_scan_options` is given, '
                             f'cannot specify {duplicates}')

        if read_options is None:
            read_options = ParquetReadOptions(**read_options_args)
        elif isinstance(read_options, dict):
            # For backwards compatibility
            duplicates = []
            for option, value in read_options.items():
                if option in _PARQUET_READ_OPTIONS:
                    read_options_args[option] = value
                else:
                    duplicates.append(option)
                    scan_args[option] = value
            if duplicates:
                duplicates = ", ".join(duplicates)
                warnings.warn(f'The scan options {duplicates} should be '
                              'specified directly as keyword arguments')
            read_options = ParquetReadOptions(**read_options_args)
        elif not isinstance(read_options, ParquetReadOptions):
            raise TypeError('`read_options` must be either a dictionary or an '
                            'instance of ParquetReadOptions')

        if default_fragment_scan_options is None:
            default_fragment_scan_options = ParquetFragmentScanOptions(
                **scan_args)
        elif isinstance(default_fragment_scan_options, dict):
            default_fragment_scan_options = ParquetFragmentScanOptions(
                **default_fragment_scan_options)
        elif not isinstance(default_fragment_scan_options,
                            ParquetFragmentScanOptions):
            raise TypeError('`default_fragment_scan_options` must be either a '
                            'dictionary or an instance of '
                            'ParquetFragmentScanOptions')

        wrapped = make_shared[CParquetFileFormat]()
        options = &(wrapped.get().reader_options)
        if read_options.dictionary_columns is not None:
            for column in read_options.dictionary_columns:
                options.dict_columns.insert(tobytes(column))
        options.coerce_int96_timestamp_unit = \
            read_options._coerce_int96_timestamp_unit

        self.init(<shared_ptr[CFileFormat]> wrapped)
        self.default_fragment_scan_options = default_fragment_scan_options

    cdef void init(self, const shared_ptr[CFileFormat]& sp):
        FileFormat.init(self, sp)
        self.parquet_format = <CParquetFileFormat*> sp.get()

    cdef WrittenFile _finish_write(self, path, base_dir,
                                   CFileWriter* file_writer):
        cdef:
            FileMetaData parquet_metadata
            CParquetFileWriter* parquet_file_writer

        parquet_metadata = None
        parquet_file_writer = dynamic_cast[_CParquetFileWriterPtr](file_writer)
        with nogil:
            metadata = deref(
                deref(parquet_file_writer).parquet_writer()).metadata()
        if metadata:
            parquet_metadata = FileMetaData()
            parquet_metadata.init(metadata)
            parquet_metadata.set_file_path(os.path.relpath(path, base_dir))

        size = GetResultValue(file_writer.GetBytesWritten())

        return WrittenFile(path, parquet_metadata, size)

    @property
    def read_options(self):
        cdef CParquetFileFormatReaderOptions* options
        options = &self.parquet_format.reader_options
        parquet_read_options = ParquetReadOptions(
            dictionary_columns={frombytes(col)
                                for col in options.dict_columns},
        )
        # Read options getter/setter works with strings so setting
        # the private property which uses the C Type
        parquet_read_options._coerce_int96_timestamp_unit = \
            options.coerce_int96_timestamp_unit
        return parquet_read_options

    def make_write_options(self, **kwargs):
        opts = FileFormat.make_write_options(self)
        (<ParquetFileWriteOptions> opts).update(**kwargs)
        return opts

    cdef _set_default_fragment_scan_options(self, FragmentScanOptions options):
        if options.type_name == 'parquet':
            self.parquet_format.default_fragment_scan_options = options.wrapped
        else:
            super()._set_default_fragment_scan_options(options)

    def equals(self, ParquetFileFormat other):
        return (
            self.read_options.equals(other.read_options) and
            self.default_fragment_scan_options ==
            other.default_fragment_scan_options
        )

    @property
    def default_extname(self):
        return "parquet"

    def __reduce__(self):
        return ParquetFileFormat, (self.read_options,
                                   self.default_fragment_scan_options)

    def __repr__(self):
        return f"<ParquetFileFormat read_options={self.read_options}>"

    def make_fragment(self, file, filesystem=None,
                      Expression partition_expression=None, row_groups=None):
        cdef:
            vector[int] c_row_groups

        if partition_expression is None:
            partition_expression = _true

        if row_groups is None:
            return super().make_fragment(file, filesystem,
                                         partition_expression)

        c_source = _make_file_source(file, filesystem)
        c_row_groups = [<int> row_group for row_group in set(row_groups)]

        c_fragment = <shared_ptr[CFragment]> GetResultValue(
            self.parquet_format.MakeFragment(move(c_source),
                                             partition_expression.unwrap(),
                                             <shared_ptr[CSchema]>nullptr,
                                             move(c_row_groups)))
        return Fragment.wrap(move(c_fragment))
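make_fragment extends the base implementation with a row_groups argument, so a fragment can view just a slice of one Parquet file. A sketch of that call (the file path is hypothetical):

import pyarrow.dataset as ds
from pyarrow import fs

fmt = ds.ParquetFileFormat()
# a fragment viewing only row groups 0 and 2 of a single file
frag = fmt.make_fragment("data/part-0.parquet",
                         filesystem=fs.LocalFileSystem(),
                         row_groups=[0, 2])
table = frag.to_table()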


class RowGroupInfo:
    """
    A wrapper class for RowGroup information

    Parameters
    ----------
    id : integer
        The group ID.
    metadata : FileMetaData
        The rowgroup metadata.
    schema : Schema
        Schema of the rows.
    """

    def __init__(self, id, metadata, schema):
        self.id = id
        self.metadata = metadata
        self.schema = schema

    @property
    def num_rows(self):
        return self.metadata.num_rows

    @property
    def total_byte_size(self):
        return self.metadata.total_byte_size

    @property
    def statistics(self):
        def name_stats(i):
            col = self.metadata.column(i)

            stats = col.statistics
            if stats is None or not stats.has_min_max:
                return None, None

            name = col.path_in_schema
            field_index = self.schema.get_field_index(name)
            if field_index < 0:
                return None, None

            typ = self.schema.field(field_index).type
            return col.path_in_schema, {
                'min': pa.scalar(stats.min, type=typ).as_py(),
                'max': pa.scalar(stats.max, type=typ).as_py()
            }

        return {
            name: stats for name, stats
            in map(name_stats, range(self.metadata.num_columns))
            if stats is not None
        }

    def __repr__(self):
        return "RowGroupInfo({})".format(self.id)

    def __eq__(self, other):
        if isinstance(other, int):
            return self.id == other
        if not isinstance(other, RowGroupInfo):
            return False
        return self.id == other.id


cdef class ParquetFileFragment(FileFragment):
    """A Fragment representing a parquet file."""

    cdef:
        CParquetFileFragment* parquet_file_fragment

    cdef void init(self, const shared_ptr[CFragment]& sp):
        FileFragment.init(self, sp)
        self.parquet_file_fragment = <CParquetFileFragment*> sp.get()

    def __reduce__(self):
        buffer = self.buffer
        # parquet_file_fragment.row_groups() is empty if the metadata
        # information of the file is not yet populated
        if not bool(self.parquet_file_fragment.row_groups()):
            row_groups = None
        else:
            row_groups = [row_group.id for row_group in self.row_groups]

        return self.format.make_fragment, (
            self.path if buffer is None else buffer,
            self.filesystem,
            self.partition_expression,
            row_groups
        )

    def ensure_complete_metadata(self):
        """
        Ensure that all metadata (statistics, physical schema, ...) have
        been read and cached in this fragment.
        """
        with nogil:
            check_status(self.parquet_file_fragment.EnsureCompleteMetadata())

    @property
    def row_groups(self):
        metadata = self.metadata
        cdef vector[int] row_groups = self.parquet_file_fragment.row_groups()
        return [RowGroupInfo(i, metadata.row_group(i), self.physical_schema)
                for i in row_groups]

    @property
    def metadata(self):
        self.ensure_complete_metadata()
        cdef FileMetaData metadata = FileMetaData()
        metadata.init(self.parquet_file_fragment.metadata())
        return metadata

    @property
    def num_row_groups(self):
        """
        Return the number of row groups viewed by this fragment (not the
        number of row groups in the origin file).
        """
        self.ensure_complete_metadata()
        return self.parquet_file_fragment.row_groups().size()

    def split_by_row_group(self, Expression filter=None,
                           Schema schema=None):
        """
        Split the fragment into multiple fragments.

        Yield a Fragment wrapping each row group in this ParquetFileFragment.
        Row groups will be excluded whose metadata contradicts the optional
        filter.

        Parameters
        ----------
        filter : Expression, default None
            Only include the row groups which satisfy this predicate (using
            the Parquet RowGroup statistics).
        schema : Schema, default None
            Schema to use when filtering row groups. Defaults to the
            Fragment's physical schema

        Returns
        -------
        A list of Fragments
        """
        cdef:
            vector[shared_ptr[CFragment]] c_fragments
            CExpression c_filter
            shared_ptr[CFragment] c_fragment

        schema = schema or self.physical_schema
        c_filter = _bind(filter, schema)
        with nogil:
            c_fragments = move(GetResultValue(
                self.parquet_file_fragment.SplitByRowGroup(move(c_filter))))

        return [Fragment.wrap(c_fragment) for c_fragment in c_fragments]

    def subset(self, Expression filter=None, Schema schema=None,
               object row_group_ids=None):
        """
        Create a subset of the fragment (viewing a subset of the row groups).

        Subset can be specified by either a filter predicate (with optional
        schema) or by a list of row group IDs. Note that when using a filter,
        the resulting fragment can be empty (viewing no row groups).

        Parameters
        ----------
        filter : Expression, default None
            Only include the row groups which satisfy this predicate (using
            the Parquet RowGroup statistics).
        schema : Schema, default None
            Schema to use when filtering row groups. Defaults to the
            Fragment's physical schema
        row_group_ids : list of ints
            The row group IDs to include in the subset. Can only be specified
            if `filter` is None.

        Returns
        -------
        ParquetFileFragment
        """
        cdef:
            CExpression c_filter
            vector[int] c_row_group_ids
            shared_ptr[CFragment] c_fragment

        if filter is not None and row_group_ids is not None:
            raise ValueError(
                "Cannot specify both 'filter' and 'row_group_ids'."
            )

        if filter is not None:
            schema = schema or self.physical_schema
            c_filter = _bind(filter, schema)
            with nogil:
                c_fragment = move(GetResultValue(
                    self.parquet_file_fragment.SubsetWithFilter(
                        move(c_filter))))
        elif row_group_ids is not None:
            c_row_group_ids = [
                <int> row_group for row_group in sorted(set(row_group_ids))
            ]
            with nogil:
                c_fragment = move(GetResultValue(
                    self.parquet_file_fragment.SubsetWithIds(
                        move(c_row_group_ids))))
        else:
            raise ValueError(
                "Need to specify one of 'filter' or 'row_group_ids'"
            )

        return Fragment.wrap(c_fragment)
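split_by_row_group and subset are the two ways to narrow a fragment: the first yields one fragment per surviving row group, the second keeps a single fragment viewing fewer row groups. A sketch building on the fragment from the earlier example:

import pyarrow.dataset as ds

# one fragment per row group whose statistics admit the predicate
parts = frag.split_by_row_group(filter=ds.field("x") > 0)

# or a single narrower fragment, selected by explicit row group IDs
narrowed = frag.subset(row_group_ids=[0])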


cdef class ParquetReadOptions(_Weakrefable):
    """
    Parquet format specific options for reading.

    Parameters
    ----------
    dictionary_columns : list of string, default None
        Names of columns which should be dictionary encoded as
        they are read
    coerce_int96_timestamp_unit : str, default None
        Cast timestamps that are stored in INT96 format to a particular
        resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
        and therefore INT96 timestamps will be inferred as timestamps
        in nanoseconds
    """

    cdef public:
        set dictionary_columns
        TimeUnit _coerce_int96_timestamp_unit

    # Also see _PARQUET_READ_OPTIONS
    def __init__(self, dictionary_columns=None,
                 coerce_int96_timestamp_unit=None):
        self.dictionary_columns = set(dictionary_columns or set())
        self.coerce_int96_timestamp_unit = coerce_int96_timestamp_unit

    @property
    def coerce_int96_timestamp_unit(self):
        return timeunit_to_string(self._coerce_int96_timestamp_unit)

    @coerce_int96_timestamp_unit.setter
    def coerce_int96_timestamp_unit(self, unit):
        if unit is not None:
            self._coerce_int96_timestamp_unit = string_to_timeunit(unit)
        else:
            self._coerce_int96_timestamp_unit = TimeUnit_NANO

    def equals(self, ParquetReadOptions other):
        return (self.dictionary_columns == other.dictionary_columns and
                self.coerce_int96_timestamp_unit ==
                other.coerce_int96_timestamp_unit)

    def __eq__(self, other):
        try:
            return self.equals(other)
        except TypeError:
            return False

    def __repr__(self):
        return (
            f"<ParquetReadOptions"
            f" dictionary_columns={self.dictionary_columns}"
            f" coerce_int96_timestamp_unit={self.coerce_int96_timestamp_unit}>"
        )
|
||||
|
||||
cdef class ParquetFileWriteOptions(FileWriteOptions):
|
||||
|
||||
cdef:
|
||||
CParquetFileWriteOptions* parquet_options
|
||||
object _properties
|
||||
|
||||
def update(self, **kwargs):
|
||||
arrow_fields = {
|
||||
"use_deprecated_int96_timestamps",
|
||||
"coerce_timestamps",
|
||||
"allow_truncated_timestamps",
|
||||
}
|
||||
|
||||
setters = set()
|
||||
for name, value in kwargs.items():
|
||||
if name not in self._properties:
|
||||
raise TypeError("unexpected parquet write option: " + name)
|
||||
self._properties[name] = value
|
||||
if name in arrow_fields:
|
||||
setters.add(self._set_arrow_properties)
|
||||
else:
|
||||
setters.add(self._set_properties)
|
||||
|
||||
for setter in setters:
|
||||
setter()
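
    # Illustrative usage sketch (not part of the vendored source): instances
    # are normally obtained from ParquetFileFormat.make_write_options(); the
    # option values below are hypothetical.
    #
    #   import pyarrow.dataset as ds
    #   write_options = ds.ParquetFileFormat().make_write_options()
    #   write_options.update(compression="zstd", compression_level=3)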

    def _set_properties(self):
        cdef CParquetFileWriteOptions* opts = self.parquet_options

        opts.writer_properties = _create_writer_properties(
            use_dictionary=self._properties["use_dictionary"],
            compression=self._properties["compression"],
            version=self._properties["version"],
            write_statistics=self._properties["write_statistics"],
            data_page_size=self._properties["data_page_size"],
            compression_level=self._properties["compression_level"],
            use_byte_stream_split=(
                self._properties["use_byte_stream_split"]
            ),
            column_encoding=self._properties["column_encoding"],
            data_page_version=self._properties["data_page_version"],
        )

    def _set_arrow_properties(self):
        cdef CParquetFileWriteOptions* opts = self.parquet_options

        opts.arrow_writer_properties = _create_arrow_writer_properties(
            use_deprecated_int96_timestamps=(
                self._properties["use_deprecated_int96_timestamps"]
            ),
            coerce_timestamps=self._properties["coerce_timestamps"],
            allow_truncated_timestamps=(
                self._properties["allow_truncated_timestamps"]
            ),
            writer_engine_version="V2",
            use_compliant_nested_type=(
                self._properties["use_compliant_nested_type"]
            )
        )

    cdef void init(self, const shared_ptr[CFileWriteOptions]& sp):
        FileWriteOptions.init(self, sp)
        self.parquet_options = <CParquetFileWriteOptions*> sp.get()
        self._properties = dict(
            use_dictionary=True,
            compression="snappy",
            version="1.0",
            write_statistics=None,
            data_page_size=None,
            compression_level=None,
            use_byte_stream_split=False,
            column_encoding=None,
            data_page_version="1.0",
            use_deprecated_int96_timestamps=False,
            coerce_timestamps=None,
            allow_truncated_timestamps=False,
            use_compliant_nested_type=False,
        )
        self._set_properties()
        self._set_arrow_properties()


cdef set _PARQUET_READ_OPTIONS = {
    'dictionary_columns', 'coerce_int96_timestamp_unit'
}


cdef class ParquetFragmentScanOptions(FragmentScanOptions):
    """
    Scan-specific options for Parquet fragments.

    Parameters
    ----------
    use_buffered_stream : bool, default False
        Read files through buffered input streams rather than loading entire
        row groups at once. This may be enabled to reduce memory overhead.
        Disabled by default.
    buffer_size : int, default 8192
        Size of buffered stream, if enabled. Default is 8KB.
    pre_buffer : bool, default False
        If enabled, pre-buffer the raw Parquet data instead of issuing one
        read per column chunk. This can improve performance on high-latency
        filesystems.
    thrift_string_size_limit : int, default None
        If not None, override the maximum total string size allocated
        when decoding Thrift structures. The default limit should be
        sufficient for most Parquet files.
    thrift_container_size_limit : int, default None
        If not None, override the maximum total size of containers allocated
        when decoding Thrift structures. The default limit should be
        sufficient for most Parquet files.
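
    Examples
    --------
    A minimal sketch (the option values are illustrative):

    >>> import pyarrow.dataset as ds
    >>> scan_options = ds.ParquetFragmentScanOptions(
    ...     use_buffered_stream=True, buffer_size=16384)  # doctest: +SKIP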
    """

    cdef:
        CParquetFragmentScanOptions* parquet_options

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, *, bint use_buffered_stream=False,
                 buffer_size=8192,
                 bint pre_buffer=False,
                 thrift_string_size_limit=None,
                 thrift_container_size_limit=None):
        self.init(shared_ptr[CFragmentScanOptions](
            new CParquetFragmentScanOptions()))
        self.use_buffered_stream = use_buffered_stream
        self.buffer_size = buffer_size
        self.pre_buffer = pre_buffer
        if thrift_string_size_limit is not None:
            self.thrift_string_size_limit = thrift_string_size_limit
        if thrift_container_size_limit is not None:
            self.thrift_container_size_limit = thrift_container_size_limit

    cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp):
        FragmentScanOptions.init(self, sp)
        self.parquet_options = <CParquetFragmentScanOptions*> sp.get()

    cdef CReaderProperties* reader_properties(self):
        return self.parquet_options.reader_properties.get()

    cdef ArrowReaderProperties* arrow_reader_properties(self):
        return self.parquet_options.arrow_reader_properties.get()

    @property
    def use_buffered_stream(self):
        return self.reader_properties().is_buffered_stream_enabled()

    @use_buffered_stream.setter
    def use_buffered_stream(self, bint use_buffered_stream):
        if use_buffered_stream:
            self.reader_properties().enable_buffered_stream()
        else:
            self.reader_properties().disable_buffered_stream()

    @property
    def buffer_size(self):
        return self.reader_properties().buffer_size()

    @buffer_size.setter
    def buffer_size(self, buffer_size):
        if buffer_size <= 0:
            raise ValueError("Buffer size must be larger than zero")
        self.reader_properties().set_buffer_size(buffer_size)

    @property
    def pre_buffer(self):
        return self.arrow_reader_properties().pre_buffer()

    @pre_buffer.setter
    def pre_buffer(self, bint pre_buffer):
        self.arrow_reader_properties().set_pre_buffer(pre_buffer)

    @property
    def thrift_string_size_limit(self):
        return self.reader_properties().thrift_string_size_limit()

    @thrift_string_size_limit.setter
    def thrift_string_size_limit(self, size):
        if size <= 0:
            raise ValueError("size must be larger than zero")
        self.reader_properties().set_thrift_string_size_limit(size)

    @property
    def thrift_container_size_limit(self):
        return self.reader_properties().thrift_container_size_limit()

    @thrift_container_size_limit.setter
    def thrift_container_size_limit(self, size):
        if size <= 0:
            raise ValueError("size must be larger than zero")
        self.reader_properties().set_thrift_container_size_limit(size)

    def equals(self, ParquetFragmentScanOptions other):
        attrs = (
            self.use_buffered_stream, self.buffer_size, self.pre_buffer,
            self.thrift_string_size_limit, self.thrift_container_size_limit)
        other_attrs = (
            other.use_buffered_stream, other.buffer_size, other.pre_buffer,
            other.thrift_string_size_limit,
            other.thrift_container_size_limit)
        return attrs == other_attrs

    @classmethod
    def _reconstruct(cls, kwargs):
        return cls(**kwargs)

    def __reduce__(self):
        kwargs = dict(
            use_buffered_stream=self.use_buffered_stream,
            buffer_size=self.buffer_size,
            pre_buffer=self.pre_buffer,
            thrift_string_size_limit=self.thrift_string_size_limit,
            thrift_container_size_limit=self.thrift_container_size_limit,
        )
        return type(self)._reconstruct, (kwargs,)


cdef class ParquetFactoryOptions(_Weakrefable):
    """
    Influences the discovery of a parquet dataset.

    Parameters
    ----------
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning discovery.
        The ignored files will still be part of the Dataset, but will not
        have partition information.
    partitioning : Partitioning, PartitioningFactory, optional
        The partitioning scheme applied to fragments, see ``Partitioning``.
    validate_column_chunk_paths : bool, default False
        Assert that all ColumnChunk paths are consistent. The parquet spec
        allows for ColumnChunk data to be stored in multiple files, but
        ParquetDatasetFactory supports only a single file with all ColumnChunk
        data. If this flag is set, construction of a ParquetDatasetFactory will
        raise an error if ColumnChunk data is not resident in a single file.
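
    Examples
    --------
    A minimal sketch (the partitioning choice is illustrative):

    >>> import pyarrow.dataset as ds
    >>> factory_options = ds.ParquetFactoryOptions(
    ...     partitioning=ds.HivePartitioning.discover(),
    ...     validate_column_chunk_paths=True)  # doctest: +SKIP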
    """

    cdef:
        CParquetFactoryOptions options

    __slots__ = ()  # avoid mistakenly creating attributes

    def __init__(self, partition_base_dir=None, partitioning=None,
                 validate_column_chunk_paths=False):
        if isinstance(partitioning, PartitioningFactory):
            self.partitioning_factory = partitioning
        elif isinstance(partitioning, Partitioning):
            self.partitioning = partitioning

        if partition_base_dir is not None:
            self.partition_base_dir = partition_base_dir

        self.options.validate_column_chunk_paths = validate_column_chunk_paths

    cdef inline CParquetFactoryOptions unwrap(self):
        return self.options

    @property
    def partitioning(self):
        """Partitioning to apply to discovered files.

        NOTE: setting this property will overwrite partitioning_factory.
        """
        c_partitioning = self.options.partitioning.partitioning()
        if c_partitioning.get() == nullptr:
            return None
        return Partitioning.wrap(c_partitioning)

    @partitioning.setter
    def partitioning(self, Partitioning value):
        self.options.partitioning = (<Partitioning> value).unwrap()

    @property
    def partitioning_factory(self):
        """PartitioningFactory to apply to discovered files and
        discover a Partitioning.

        NOTE: setting this property will overwrite partitioning.
        """
        c_factory = self.options.partitioning.factory()
        if c_factory.get() == nullptr:
            return None
        return PartitioningFactory.wrap(c_factory)

    @partitioning_factory.setter
    def partitioning_factory(self, PartitioningFactory value):
        self.options.partitioning = (<PartitioningFactory> value).unwrap()

    @property
    def partition_base_dir(self):
        """
        Base directory to strip paths before applying the partitioning.
        """
        return frombytes(self.options.partition_base_dir)

    @partition_base_dir.setter
    def partition_base_dir(self, value):
        self.options.partition_base_dir = tobytes(value)

    @property
    def validate_column_chunk_paths(self):
        """
        Whether to assert that all ColumnChunk paths are consistent.
        """
        return self.options.validate_column_chunk_paths

    @validate_column_chunk_paths.setter
    def validate_column_chunk_paths(self, value):
        self.options.validate_column_chunk_paths = value


cdef class ParquetDatasetFactory(DatasetFactory):
    """
    Create a ParquetDatasetFactory from a Parquet `_metadata` file.

    Parameters
    ----------
    metadata_path : str
        Path to the `_metadata` parquet metadata-only file generated with
        `pyarrow.parquet.write_metadata`.
    filesystem : pyarrow.fs.FileSystem
        Filesystem to read the metadata_path from, and subsequent parquet
        files.
    format : ParquetFileFormat
        Parquet format options.
    options : ParquetFactoryOptions, optional
        Various flags influencing the discovery of filesystem paths.
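
    Examples
    --------
    This factory backs the public :func:`pyarrow.dataset.parquet_dataset`
    helper; a minimal sketch (the metadata path is hypothetical):

    >>> import pyarrow.dataset as ds
    >>> dataset = ds.parquet_dataset("/data/dataset/_metadata")  # doctest: +SKIP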
    """

    cdef:
        CParquetDatasetFactory* parquet_factory

    def __init__(self, metadata_path, FileSystem filesystem not None,
                 FileFormat format not None,
                 ParquetFactoryOptions options=None):
        cdef:
            c_string c_path
            shared_ptr[CFileSystem] c_filesystem
            shared_ptr[CParquetFileFormat] c_format
            CResult[shared_ptr[CDatasetFactory]] result
            CParquetFactoryOptions c_options

        c_path = tobytes(metadata_path)
        c_filesystem = filesystem.unwrap()
        c_format = static_pointer_cast[CParquetFileFormat, CFileFormat](
            format.unwrap())
        options = options or ParquetFactoryOptions()
        c_options = options.unwrap()

        with nogil:
            result = CParquetDatasetFactory.MakeFromMetaDataPath(
                c_path, c_filesystem, c_format, c_options)
        self.init(GetResultValue(result))

    cdef init(self, shared_ptr[CDatasetFactory]& sp):
        DatasetFactory.init(self, sp)
        self.parquet_factory = <CParquetDatasetFactory*> sp.get()
BIN
venv/lib/python3.9/site-packages/pyarrow/_exec_plan.cpython-39-darwin.so
Executable file
BIN
venv/lib/python3.9/site-packages/pyarrow/_exec_plan.cpython-39-darwin.so
Executable file
Binary file not shown.
452
venv/lib/python3.9/site-packages/pyarrow/_exec_plan.pyx
Normal file
452
venv/lib/python3.9/site-packages/pyarrow/_exec_plan.pyx
Normal file
@@ -0,0 +1,452 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# ---------------------------------------------------------------------
# Implement Internal ExecPlan bindings

# cython: profile=False
# distutils: language = c++
# cython: language_level = 3

from cython.operator cimport dereference as deref, preincrement as inc

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_dataset cimport *
from pyarrow.lib cimport (Table, check_status, pyarrow_unwrap_table, pyarrow_wrap_table,
                          RecordBatchReader)
from pyarrow.lib import tobytes
from pyarrow._compute cimport Expression, _true, _SortOptions
from pyarrow._dataset cimport Dataset, Scanner
from pyarrow._dataset import InMemoryDataset

Initialize()  # Initialise support for Datasets in ExecPlan


cdef execplan(inputs, output_type, vector[CDeclaration] plan, c_bool use_threads=True,
              _SortOptions sort_options=None):
    """
    Internal function to create an ExecPlan and run it.

    Parameters
    ----------
    inputs : list of Table or Dataset
        The sources from which the ExecPlan should fetch data.
        In most cases this is only one, unless the first node of the
        plan is able to get data from multiple different sources.
    output_type : Table or InMemoryDataset
        In which format the output should be provided.
    plan : vector[CDeclaration]
        The nodes of the plan that should be applied to the sources
        to produce the output.
    use_threads : bool, default True
        Whether to use multithreading or not.
    sort_options : _SortOptions, default None
        If not None, the output is produced through an "order_by_sink"
        node instead of a plain "sink" node.
    """
    cdef:
        CExecutor *c_executor
        shared_ptr[CExecContext] c_exec_context
        shared_ptr[CExecPlan] c_exec_plan
        CDeclaration current_decl
        vector[CDeclaration] c_decls
        vector[CExecNode*] _empty
        vector[CExecNode*] c_final_node_vec
        CExecNode *c_node
        CTable* c_table
        shared_ptr[CTable] c_in_table
        shared_ptr[CTable] c_out_table
        shared_ptr[CTableSourceNodeOptions] c_tablesourceopts
        shared_ptr[CScanner] c_dataset_scanner
        shared_ptr[CScanNodeOptions] c_scanopts
        shared_ptr[CExecNodeOptions] c_input_node_opts
        shared_ptr[CSinkNodeOptions] c_sinkopts
        shared_ptr[COrderBySinkNodeOptions] c_orderbysinkopts
        shared_ptr[CAsyncExecBatchGenerator] c_async_exec_batch_gen
        shared_ptr[CRecordBatchReader] c_recordbatchreader
        shared_ptr[CRecordBatchReader] c_recordbatchreader_in
        vector[CDeclaration].iterator plan_iter
        vector[CDeclaration.Input] no_c_inputs
        CStatus c_plan_status

    if use_threads:
        c_executor = GetCpuThreadPool()
    else:
        c_executor = NULL

    # TODO(weston): This is deprecated. Once ordering is better supported
    # in the exec plan we can remove all references to ExecPlan and use the
    # DeclarationToXyz methods
    c_exec_context = make_shared[CExecContext](
        c_default_memory_pool(), c_executor)
    c_exec_plan = GetResultValue(CExecPlan.Make(c_exec_context.get()))

    plan_iter = plan.begin()

    # Create source nodes for each input
    for ipt in inputs:
        if isinstance(ipt, Table):
            c_in_table = pyarrow_unwrap_table(ipt)
            c_tablesourceopts = make_shared[CTableSourceNodeOptions](
                c_in_table)
            c_input_node_opts = static_pointer_cast[CExecNodeOptions, CTableSourceNodeOptions](
                c_tablesourceopts)

            current_decl = CDeclaration(
                tobytes("table_source"), no_c_inputs, c_input_node_opts)
        elif isinstance(ipt, Dataset):
            c_in_dataset = (<Dataset>ipt).unwrap()
            c_scanopts = make_shared[CScanNodeOptions](
                c_in_dataset, Scanner._make_scan_options(ipt, {"use_threads": use_threads}))
            c_input_node_opts = static_pointer_cast[CExecNodeOptions, CScanNodeOptions](
                c_scanopts)

            # Filters applied in CScanNodeOptions are "best effort" for the scan node itself,
            # so we always need to inject an additional Filter node to apply them for real.
            current_decl = CDeclaration(
                tobytes("filter"),
                no_c_inputs,
                static_pointer_cast[CExecNodeOptions, CFilterNodeOptions](
                    make_shared[CFilterNodeOptions](
                        deref(deref(c_scanopts).scan_options).filter
                    )
                )
            )
            current_decl.inputs.push_back(
                CDeclaration.Input(
                    CDeclaration(tobytes("scan"), no_c_inputs, c_input_node_opts))
            )
        else:
            raise TypeError("Unsupported type")

        if plan_iter != plan.end():
            # Flag the source as the input of the first plan node.
            deref(plan_iter).inputs.push_back(CDeclaration.Input(current_decl))
        else:
            # Empty plan, make the source the first plan node.
            c_decls.push_back(current_decl)

    # Add here the additional nodes of the plan
    while plan_iter != plan.end():
        c_decls.push_back(deref(plan_iter))
        inc(plan_iter)

    # Add all CDeclarations to the plan
    c_node = GetResultValue(
        CDeclaration.Sequence(c_decls).AddToPlan(&deref(c_exec_plan))
    )
    c_final_node_vec.push_back(c_node)

    # Create the output node
    c_async_exec_batch_gen = make_shared[CAsyncExecBatchGenerator]()

    if sort_options is None:
        c_sinkopts = make_shared[CSinkNodeOptions](
            c_async_exec_batch_gen.get())
        GetResultValue(
            MakeExecNode(tobytes("sink"), &deref(c_exec_plan),
                         c_final_node_vec, deref(c_sinkopts))
        )
    else:
        c_orderbysinkopts = make_shared[COrderBySinkNodeOptions](
            deref(<CSortOptions*>(sort_options.unwrap().get())),
            c_async_exec_batch_gen.get()
        )
        GetResultValue(
            MakeExecNode(tobytes("order_by_sink"), &deref(c_exec_plan),
                         c_final_node_vec, deref(c_orderbysinkopts))
        )

    # Convert the async generator to a sync batch reader
    c_recordbatchreader = MakeGeneratorReader(c_node.output_schema(),
                                              deref(c_async_exec_batch_gen),
                                              deref(c_exec_context).memory_pool())

    # Start execution of the ExecPlan
    deref(c_exec_plan).Validate()
    deref(c_exec_plan).StartProducing()

    # Convert output to the expected one.
    c_out_table = GetResultValue(
        CTable.FromRecordBatchReader(c_recordbatchreader.get()))
    if output_type == Table:
        output = pyarrow_wrap_table(c_out_table)
    elif output_type == InMemoryDataset:
        output = InMemoryDataset(pyarrow_wrap_table(c_out_table))
    else:
        raise TypeError("Unsupported output type")

    with nogil:
        c_plan_status = deref(c_exec_plan).finished().status()
    check_status(c_plan_status)

    return output


def _perform_join(join_type, left_operand not None, left_keys,
                  right_operand not None, right_keys,
                  left_suffix=None, right_suffix=None,
                  use_threads=True, coalesce_keys=False,
                  output_type=Table):
    """
    Perform a join of two tables or datasets.

    The result will be an output table with the result of the join operation.

    Parameters
    ----------
    join_type : str
        One of the supported join types.
    left_operand : Table or Dataset
        The left operand for the join operation.
    left_keys : str or list[str]
        The left key (or keys) on which the join operation should be performed.
    right_operand : Table or Dataset
        The right operand for the join operation.
    right_keys : str or list[str]
        The right key (or keys) on which the join operation should be performed.
    left_suffix : str, default None
        Which suffix to add to left column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    right_suffix : str, default None
        Which suffix to add to the right column names. This prevents confusion
        when the columns in left and right operands have colliding names.
    use_threads : bool, default True
        Whether to use multithreading or not.
    coalesce_keys : bool, default False
        If the duplicated keys should be omitted from one of the sides
        in the join result.
    output_type : Table or InMemoryDataset
        The output type for the exec plan result.

    Returns
    -------
    result_table : Table or InMemoryDataset
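
    Examples
    --------
    This helper backs the public :meth:`pyarrow.Table.join`; a minimal
    sketch (the tables and key name are hypothetical):

    >>> import pyarrow as pa
    >>> t1 = pa.table({"id": [1, 2], "x": ["a", "b"]})
    >>> t2 = pa.table({"id": [1, 2], "y": [10, 20]})
    >>> t1.join(t2, keys="id", join_type="inner")  # doctest: +SKIP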
    """
    cdef:
        vector[CFieldRef] c_left_keys
        vector[CFieldRef] c_right_keys
        vector[CFieldRef] c_left_columns
        vector[CFieldRef] c_right_columns
        vector[CDeclaration] c_decl_plan
        vector[CExpression] c_projections
        vector[c_string] c_projected_col_names
        CJoinType c_join_type

    # Prepare the left and right table keys to send them to the C++ function
    left_keys_order = {}
    if isinstance(left_keys, str):
        left_keys = [left_keys]
    for idx, key in enumerate(left_keys):
        left_keys_order[key] = idx
        c_left_keys.push_back(CFieldRef(<c_string>tobytes(key)))

    right_keys_order = {}
    if isinstance(right_keys, str):
        right_keys = [right_keys]
    for idx, key in enumerate(right_keys):
        right_keys_order[key] = idx
        c_right_keys.push_back(CFieldRef(<c_string>tobytes(key)))

    # By default expose all columns on both left and right table
    if isinstance(left_operand, Table):
        left_columns = left_operand.column_names
    elif isinstance(left_operand, Dataset):
        left_columns = left_operand.schema.names
    else:
        raise TypeError("Unsupported left join member type")

    if isinstance(right_operand, Table):
        right_columns = right_operand.column_names
    elif isinstance(right_operand, Dataset):
        right_columns = right_operand.schema.names
    else:
        raise TypeError("Unsupported right join member type")

    # Pick the join type
    if join_type == "left semi":
        c_join_type = CJoinType_LEFT_SEMI
        right_columns = []
    elif join_type == "right semi":
        c_join_type = CJoinType_RIGHT_SEMI
        left_columns = []
    elif join_type == "left anti":
        c_join_type = CJoinType_LEFT_ANTI
        right_columns = []
    elif join_type == "right anti":
        c_join_type = CJoinType_RIGHT_ANTI
        left_columns = []
    elif join_type == "inner":
        c_join_type = CJoinType_INNER
        right_columns = [
            col for col in right_columns if col not in right_keys_order
        ]
    elif join_type == "left outer":
        c_join_type = CJoinType_LEFT_OUTER
        right_columns = [
            col for col in right_columns if col not in right_keys_order
        ]
    elif join_type == "right outer":
        c_join_type = CJoinType_RIGHT_OUTER
        left_columns = [
            col for col in left_columns if col not in left_keys_order
        ]
    elif join_type == "full outer":
        c_join_type = CJoinType_FULL_OUTER
    else:
        raise ValueError("Unsupported join type")

    # Turn the columns to vectors of FieldRefs
    # and set aside indices of keys.
    left_column_keys_indices = {}
    for idx, colname in enumerate(left_columns):
        c_left_columns.push_back(CFieldRef(<c_string>tobytes(colname)))
        if colname in left_keys:
            left_column_keys_indices[colname] = idx
    right_column_keys_indices = {}
    for idx, colname in enumerate(right_columns):
        c_right_columns.push_back(CFieldRef(<c_string>tobytes(colname)))
        if colname in right_keys:
            right_column_keys_indices[colname] = idx

    # Add the join node to the execplan
    if coalesce_keys:
        c_decl_plan.push_back(
            CDeclaration(tobytes("hashjoin"), CHashJoinNodeOptions(
                c_join_type, c_left_keys, c_right_keys,
                c_left_columns, c_right_columns,
                _true,
                <c_string>tobytes(left_suffix or ""),
                <c_string>tobytes(right_suffix or "")
            ))
        )
        if join_type == "full outer":
            # In case of full outer joins, the join operation will output all columns
            # so that we can coalesce the keys and exclude duplicates in a subsequent projection.
            left_columns_set = set(left_columns)
            right_columns_set = set(right_columns)
            # Where the right table columns start.
            right_operand_index = len(left_columns)
            for idx, col in enumerate(left_columns + right_columns):
                if idx < len(left_columns) and col in left_column_keys_indices:
                    # Include keys only once and coalesce left+right table keys.
                    c_projected_col_names.push_back(tobytes(col))
                    # Get the index of the right key that is being paired
                    # with this left key. We do so by retrieving the name
                    # of the right key that is in the same position in the provided keys
                    # and then looking up the index for that name in the right table.
                    right_key_index = right_column_keys_indices[right_keys[left_keys_order[col]]]
                    c_projections.push_back(Expression.unwrap(
                        Expression._call("coalesce", [
                            Expression._field(idx), Expression._field(
                                right_operand_index+right_key_index)
                        ])
                    ))
                elif idx >= right_operand_index and col in right_column_keys_indices:
                    # Do not include right table keys, as they would lead to
                    # duplicated keys.
                    continue
                else:
                    # Include all the other columns as they are.
                    # Just recompute the suffixes that the join produced as the projection
                    # would lose them otherwise.
                    if left_suffix and idx < right_operand_index and col in right_columns_set:
                        col += left_suffix
                    if right_suffix and idx >= right_operand_index and col in left_columns_set:
                        col += right_suffix
                    c_projected_col_names.push_back(tobytes(col))
                    c_projections.push_back(
                        Expression.unwrap(Expression._field(idx)))
            c_decl_plan.push_back(
                CDeclaration(tobytes("project"), CProjectNodeOptions(
                    c_projections, c_projected_col_names))
            )
    else:
        c_decl_plan.push_back(
            CDeclaration(tobytes("hashjoin"), CHashJoinNodeOptions(
                c_join_type, c_left_keys, c_right_keys,
                _true,
                <c_string>tobytes(left_suffix or ""),
                <c_string>tobytes(right_suffix or "")
            ))
        )

    result_table = execplan([left_operand, right_operand],
                            plan=c_decl_plan,
                            output_type=output_type,
                            use_threads=use_threads)

    return result_table


def _filter_table(table, expression, output_type=Table):
    """Filter rows of a table or dataset based on the provided expression.

    The result will be an output table with only the rows matching
    the provided expression.

    Parameters
    ----------
    table : Table or Dataset
        Table or Dataset that should be filtered.
    expression : Expression
        The expression on which rows should be filtered.
    output_type : Table or InMemoryDataset
        The output type for the filtered result.

    Returns
    -------
    result_table : Table or InMemoryDataset
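
    Examples
    --------
    This helper backs :meth:`pyarrow.Table.filter` when it is given an
    Expression; a minimal sketch (the table and column are hypothetical):

    >>> import pyarrow as pa
    >>> import pyarrow.compute as pc
    >>> t = pa.table({"a": [1, 2, 3]})
    >>> t.filter(pc.field("a") > 1)  # doctest: +SKIP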
    """
    cdef:
        vector[CDeclaration] c_decl_plan
        Expression expr = expression

    c_decl_plan.push_back(
        CDeclaration(tobytes("filter"), CFilterNodeOptions(
            <CExpression>expr.unwrap()
        ))
    )

    r = execplan([table], plan=c_decl_plan,
                 output_type=Table, use_threads=False)

    if output_type == Table:
        return r
    elif output_type == InMemoryDataset:
        # Get rid of special dataset columns
        # "__fragment_index", "__batch_index", "__last_in_fragment", "__filename"
        return InMemoryDataset(r.select(table.schema.names))
    else:
        raise TypeError("Unsupported output type")


def _sort_source(table_or_dataset, sort_options, output_type=Table):
    cdef:
        vector[CDeclaration] c_empty_decl_plan

    r = execplan([table_or_dataset],
                 plan=c_empty_decl_plan,
                 output_type=Table,
                 use_threads=True,
                 sort_options=sort_options)

    if output_type == Table:
        return r
    elif output_type == InMemoryDataset:
        # Get rid of special dataset columns
        # "__fragment_index", "__batch_index", "__last_in_fragment", "__filename"
        return InMemoryDataset(r.select(table_or_dataset.schema.names))
    else:
        raise TypeError("Unsupported output type")
BIN
venv/lib/python3.9/site-packages/pyarrow/_feather.cpython-39-darwin.so
Executable file
BIN
venv/lib/python3.9/site-packages/pyarrow/_feather.cpython-39-darwin.so
Executable file
Binary file not shown.
117
venv/lib/python3.9/site-packages/pyarrow/_feather.pyx
Normal file
117
venv/lib/python3.9/site-packages/pyarrow/_feather.pyx
Normal file
@@ -0,0 +1,117 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# ---------------------------------------------------------------------
# Implement Feather file format

# cython: profile=False
# distutils: language = c++
# cython: language_level=3

from cython.operator cimport dereference as deref
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_feather cimport *
from pyarrow.lib cimport (check_status, Table, _Weakrefable,
                          get_writer, get_reader, pyarrow_wrap_table)
from pyarrow.lib import tobytes


class FeatherError(Exception):
    pass


def write_feather(Table table, object dest, compression=None,
                  compression_level=None, chunksize=None, version=2):
    cdef shared_ptr[COutputStream] sink
    get_writer(dest, &sink)

    cdef CFeatherProperties properties
    if version == 2:
        properties.version = kFeatherV2Version
    else:
        properties.version = kFeatherV1Version

    if compression == 'zstd':
        properties.compression = CCompressionType_ZSTD
    elif compression == 'lz4':
        properties.compression = CCompressionType_LZ4_FRAME
    else:
        properties.compression = CCompressionType_UNCOMPRESSED

    if chunksize is not None:
        properties.chunksize = chunksize

    if compression_level is not None:
        properties.compression_level = compression_level

    with nogil:
        check_status(WriteFeather(deref(table.table), sink.get(),
                                  properties))
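

# Illustrative usage sketch (not part of the vendored source): the public
# pyarrow.feather.write_feather wrapper funnels into the writer above; the
# table and file name below are hypothetical.
#
#   import pyarrow as pa
#   from pyarrow import feather
#   table = pa.table({"a": [1, 2, 3]})
#   feather.write_feather(table, "data.feather", compression="zstd")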


cdef class FeatherReader(_Weakrefable):
    cdef:
        shared_ptr[CFeatherReader] reader

    def __cinit__(self, source, c_bool use_memory_map, c_bool use_threads):
        cdef:
            shared_ptr[CRandomAccessFile] reader
            CIpcReadOptions options = CIpcReadOptions.Defaults()
        options.use_threads = use_threads

        get_reader(source, use_memory_map, &reader)
        with nogil:
            self.reader = GetResultValue(CFeatherReader.Open(reader, options))

    @property
    def version(self):
        return self.reader.get().version()

    def read(self):
        cdef shared_ptr[CTable] sp_table
        with nogil:
            check_status(self.reader.get()
                         .Read(&sp_table))

        return pyarrow_wrap_table(sp_table)

    def read_indices(self, indices):
        cdef:
            shared_ptr[CTable] sp_table
            vector[int] c_indices

        for index in indices:
            c_indices.push_back(index)
        with nogil:
            check_status(self.reader.get()
                         .Read(c_indices, &sp_table))

        return pyarrow_wrap_table(sp_table)

    def read_names(self, names):
        cdef:
            shared_ptr[CTable] sp_table
            vector[c_string] c_names

        for name in names:
            c_names.push_back(tobytes(name))
        with nogil:
            check_status(self.reader.get()
                         .Read(c_names, &sp_table))

        return pyarrow_wrap_table(sp_table)
BIN
venv/lib/python3.9/site-packages/pyarrow/_flight.cpython-39-darwin.so
Executable file
BIN
venv/lib/python3.9/site-packages/pyarrow/_flight.cpython-39-darwin.so
Executable file
Binary file not shown.
3082
venv/lib/python3.9/site-packages/pyarrow/_flight.pyx
Normal file
3082
venv/lib/python3.9/site-packages/pyarrow/_flight.pyx
Normal file
File diff suppressed because it is too large
BIN
venv/lib/python3.9/site-packages/pyarrow/_fs.cpython-39-darwin.so
Executable file
BIN
venv/lib/python3.9/site-packages/pyarrow/_fs.cpython-39-darwin.so
Executable file
Binary file not shown.
94
venv/lib/python3.9/site-packages/pyarrow/_fs.pxd
Normal file
94
venv/lib/python3.9/site-packages/pyarrow/_fs.pxd
Normal file
@@ -0,0 +1,94 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow.lib import _detect_compression, frombytes, tobytes
from pyarrow.lib cimport *


cpdef enum FileType:
    NotFound = <int8_t> CFileType_NotFound
    Unknown = <int8_t> CFileType_Unknown
    File = <int8_t> CFileType_File
    Directory = <int8_t> CFileType_Directory


cdef class FileInfo(_Weakrefable):
    cdef:
        CFileInfo info

    @staticmethod
    cdef wrap(CFileInfo info)

    cdef inline CFileInfo unwrap(self) nogil

    @staticmethod
    cdef CFileInfo unwrap_safe(obj)


cdef class FileSelector(_Weakrefable):
    cdef:
        CFileSelector selector

    @staticmethod
    cdef FileSelector wrap(CFileSelector selector)

    cdef inline CFileSelector unwrap(self) nogil


cdef class FileSystem(_Weakrefable):
    cdef:
        shared_ptr[CFileSystem] wrapped
        CFileSystem* fs

    cdef init(self, const shared_ptr[CFileSystem]& wrapped)

    @staticmethod
    cdef wrap(const shared_ptr[CFileSystem]& sp)

    cdef inline shared_ptr[CFileSystem] unwrap(self) nogil


cdef class LocalFileSystem(FileSystem):
    cdef:
        CLocalFileSystem* localfs

    cdef init(self, const shared_ptr[CFileSystem]& wrapped)


cdef class SubTreeFileSystem(FileSystem):
    cdef:
        CSubTreeFileSystem* subtreefs

    cdef init(self, const shared_ptr[CFileSystem]& wrapped)


cdef class _MockFileSystem(FileSystem):
    cdef:
        CMockFileSystem* mockfs

    cdef init(self, const shared_ptr[CFileSystem]& wrapped)


cdef class PyFileSystem(FileSystem):
    cdef:
        CPyFileSystem* pyfs

    cdef init(self, const shared_ptr[CFileSystem]& wrapped)
1623
venv/lib/python3.9/site-packages/pyarrow/_fs.pyx
Normal file
1623
venv/lib/python3.9/site-packages/pyarrow/_fs.pyx
Normal file
File diff suppressed because it is too large
BIN
venv/lib/python3.9/site-packages/pyarrow/_gcsfs.cpython-39-darwin.so
Executable file
BIN
venv/lib/python3.9/site-packages/pyarrow/_gcsfs.cpython-39-darwin.so
Executable file
Binary file not shown.
188
venv/lib/python3.9/site-packages/pyarrow/_gcsfs.pyx
Normal file
188
venv/lib/python3.9/site-packages/pyarrow/_gcsfs.pyx
Normal file
@@ -0,0 +1,188 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata,
                          pyarrow_unwrap_metadata)
from pyarrow.lib import frombytes, tobytes, KeyValueMetadata, ensure_metadata
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._fs cimport FileSystem, TimePoint_to_ns, PyDateTime_to_TimePoint
from cython.operator cimport dereference as deref

from datetime import datetime, timedelta, timezone


cdef class GcsFileSystem(FileSystem):
    """
    Google Cloud Storage (GCS) backed FileSystem implementation

    By default uses the process described in https://google.aip.dev/auth/4110
    to resolve credentials. If not running on Google Cloud Platform (GCP),
    this generally requires the environment variable
    GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file
    containing credentials.

    Note: GCS buckets are special and the operations available on them may be
    limited or more expensive than expected compared to local file systems.

    Note: When pickling a GcsFileSystem that uses default credentials, the
    resolved credentials are not stored in the serialized data. Therefore, when
    unpickling it is assumed that the necessary credentials are in place for
    the target process.

    Parameters
    ----------
    anonymous : boolean, default False
        Whether to connect anonymously.
        If true, will not attempt to look up credentials using standard GCP
        configuration methods.
    access_token : str, default None
        GCP access token. If provided, temporary credentials will be fetched by
        assuming this role; also, a `credential_token_expiration` must be
        specified as well.
    target_service_account : str, default None
        An optional service account to try to impersonate when accessing GCS. This
        requires the specified credential user or service account to have the necessary
        permissions.
    credential_token_expiration : datetime, default None
        Expiration for credential generated with an access token. Must be specified
        if `access_token` is specified.
    default_bucket_location : str, default 'US'
        GCP region to create buckets in.
    scheme : str, default 'https'
        GCS connection transport scheme.
    endpoint_override : str, default None
        Override endpoint with a connect string such as "localhost:9000"
    default_metadata : mapping or pyarrow.KeyValueMetadata, default None
        Default metadata for `open_output_stream`. This will be ignored if
        non-empty metadata is passed to `open_output_stream`.
    retry_time_limit : timedelta, default None
        Set the maximum amount of time the GCS client will attempt to retry
        transient errors. Subsecond granularity is ignored.
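
    Examples
    --------
    A minimal sketch (the bucket and object names are hypothetical):

    >>> from pyarrow import fs
    >>> gcs = fs.GcsFileSystem(anonymous=True)  # doctest: +SKIP
    >>> info = gcs.get_file_info("my-bucket/data.parquet")  # doctest: +SKIP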
    """

    cdef:
        CGcsFileSystem* gcsfs

    def __init__(self, *, bint anonymous=False, access_token=None,
                 target_service_account=None, credential_token_expiration=None,
                 default_bucket_location='US',
                 scheme=None,
                 endpoint_override=None,
                 default_metadata=None,
                 retry_time_limit=None):
        cdef:
            CGcsOptions options
            shared_ptr[CGcsFileSystem] wrapped
            double time_limit_seconds

        # Intentional use of truthiness because empty strings aren't valid and
        # reconstruction from pickling will give empty strings.
        if anonymous and (target_service_account or access_token):
            raise ValueError(
                'anonymous option is not compatible with target_service_account and '
                'access_token'
            )
        elif bool(access_token) != bool(credential_token_expiration):
            raise ValueError(
                'access_token and credential_token_expiration must be '
                'specified together'
            )

        elif anonymous:
            options = CGcsOptions.Anonymous()
        elif access_token:
            if not isinstance(credential_token_expiration, datetime):
                raise ValueError(
                    "credential_token_expiration must be a datetime")
            options = CGcsOptions.FromAccessToken(
                tobytes(access_token),
                PyDateTime_to_TimePoint(<PyDateTime_DateTime*>credential_token_expiration))
        else:
            options = CGcsOptions.Defaults()

        # Target service account requires base credentials so
        # it is not part of the if/else chain above which only
        # handles base credentials.
        if target_service_account:
            options = CGcsOptions.FromImpersonatedServiceAccount(
                options.credentials, tobytes(target_service_account))

        options.default_bucket_location = tobytes(default_bucket_location)

        if scheme is not None:
            options.scheme = tobytes(scheme)
        if endpoint_override is not None:
            options.endpoint_override = tobytes(endpoint_override)
        if default_metadata is not None:
            options.default_metadata = pyarrow_unwrap_metadata(
                ensure_metadata(default_metadata))
        if retry_time_limit is not None:
            time_limit_seconds = retry_time_limit.total_seconds()
            options.retry_limit_seconds = time_limit_seconds

        with nogil:
            wrapped = GetResultValue(CGcsFileSystem.Make(options))

        self.init(<shared_ptr[CFileSystem]> wrapped)

    cdef init(self, const shared_ptr[CFileSystem]& wrapped):
        FileSystem.init(self, wrapped)
        self.gcsfs = <CGcsFileSystem*> wrapped.get()

    @classmethod
    def _reconstruct(cls, kwargs):
        return cls(**kwargs)

    def _expiration_datetime_from_options(self):
        expiration_ns = TimePoint_to_ns(
            self.gcsfs.options().credentials.expiration())
        if expiration_ns == 0:
            return None
        return datetime.fromtimestamp(expiration_ns / 1.0e9, timezone.utc)

    def __reduce__(self):
        cdef CGcsOptions opts = self.gcsfs.options()
        service_account = frombytes(opts.credentials.target_service_account())
        expiration_dt = self._expiration_datetime_from_options()
        retry_time_limit = None
        if opts.retry_limit_seconds.has_value():
            retry_time_limit = timedelta(
                seconds=opts.retry_limit_seconds.value())
        return (
            GcsFileSystem._reconstruct, (dict(
                access_token=frombytes(opts.credentials.access_token()),
                anonymous=opts.credentials.anonymous(),
                credential_token_expiration=expiration_dt,
                target_service_account=service_account,
                scheme=frombytes(opts.scheme),
                endpoint_override=frombytes(opts.endpoint_override),
                default_bucket_location=frombytes(
                    opts.default_bucket_location),
                default_metadata=pyarrow_wrap_metadata(opts.default_metadata),
                retry_time_limit=retry_time_limit
            ),))

    @property
    def default_bucket_location(self):
        """
        The GCP location this filesystem will write to.
        """
        return frombytes(self.gcsfs.options().default_bucket_location)
@@ -0,0 +1,4 @@
# file generated by setuptools_scm
# don't change, don't track in version control
__version__ = version = '11.0.0'
__version_tuple__ = version_tuple = (11, 0, 0)
BIN
venv/lib/python3.9/site-packages/pyarrow/_hdfs.cpython-39-darwin.so
Executable file
BIN
venv/lib/python3.9/site-packages/pyarrow/_hdfs.cpython-39-darwin.so
Executable file
Binary file not shown.
156
venv/lib/python3.9/site-packages/pyarrow/_hdfs.pyx
Normal file
156
venv/lib/python3.9/site-packages/pyarrow/_hdfs.pyx
Normal file
@@ -0,0 +1,156 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from pyarrow.lib cimport check_status
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._fs cimport FileSystem

from pyarrow.lib import frombytes, tobytes
from pyarrow.util import _stringify_path


cdef class HadoopFileSystem(FileSystem):
    """
    HDFS backed FileSystem implementation

    Parameters
    ----------
    host : str
        HDFS host to connect to. Set to "default" for fs.defaultFS from
        core-site.xml.
    port : int, default 8020
        HDFS port to connect to. Set to 0 for default or logical (HA) nodes.
    user : str, default None
        Username when connecting to HDFS; None implies login user.
    replication : int, default 3
        Number of copies each block will have.
    buffer_size : int, default 0
        If 0, no buffering will happen otherwise the size of the temporary read
        and write buffer.
    default_block_size : int, default None
        None means the default configuration for HDFS, a typical block size is
        128 MB.
    kerb_ticket : string or path, default None
        If not None, the path to the Kerberos ticket cache.
    extra_conf : dict, default None
        Extra key/value pairs for configuration; will override any
        hdfs-site.xml properties.

    Examples
    --------
    >>> from pyarrow import fs
    >>> hdfs = fs.HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path)  # doctest: +SKIP

    For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`.
    """

    cdef:
        CHadoopFileSystem* hdfs

    def __init__(self, str host, int port=8020, *, str user=None,
                 int replication=3, int buffer_size=0,
                 default_block_size=None, kerb_ticket=None,
                 extra_conf=None):
        cdef:
            CHdfsOptions options
            shared_ptr[CHadoopFileSystem] wrapped

        if not host.startswith(('hdfs://', 'viewfs://')) and host != "default":
            # TODO(kszucs): do more sanitization
            host = 'hdfs://{}'.format(host)

        options.ConfigureEndPoint(tobytes(host), int(port))
        options.ConfigureReplication(replication)
        options.ConfigureBufferSize(buffer_size)

        if user is not None:
            options.ConfigureUser(tobytes(user))
        if default_block_size is not None:
            options.ConfigureBlockSize(default_block_size)
        if kerb_ticket is not None:
            options.ConfigureKerberosTicketCachePath(
                tobytes(_stringify_path(kerb_ticket)))
        if extra_conf is not None:
            for k, v in extra_conf.items():
                options.ConfigureExtraConf(tobytes(k), tobytes(v))

        with nogil:
            wrapped = GetResultValue(CHadoopFileSystem.Make(options))
        self.init(<shared_ptr[CFileSystem]> wrapped)

    cdef init(self, const shared_ptr[CFileSystem]& wrapped):
        FileSystem.init(self, wrapped)
        self.hdfs = <CHadoopFileSystem*> wrapped.get()

    @staticmethod
    def from_uri(uri):
        """
        Instantiate HadoopFileSystem object from a URI string.

        The following two calls are equivalent:

        * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\
&replication=1')``
        * ``HadoopFileSystem('localhost', port=8020, user='test', \
replication=1)``

        Parameters
        ----------
        uri : str
            A string URI describing the connection to HDFS.
            In order to change the user, replication, buffer_size or
            default_block_size pass the values as query parts.

        Returns
        -------
        HadoopFileSystem
        """
        cdef:
            HadoopFileSystem self = HadoopFileSystem.__new__(HadoopFileSystem)
            shared_ptr[CHadoopFileSystem] wrapped
            CHdfsOptions options

        options = GetResultValue(CHdfsOptions.FromUriString(tobytes(uri)))
        with nogil:
            wrapped = GetResultValue(CHadoopFileSystem.Make(options))

        self.init(<shared_ptr[CFileSystem]> wrapped)
        return self

    @classmethod
    def _reconstruct(cls, kwargs):
        return cls(**kwargs)

    def __reduce__(self):
        cdef CHdfsOptions opts = self.hdfs.options()
        return (
            HadoopFileSystem._reconstruct, (dict(
                host=frombytes(opts.connection_config.host),
                port=opts.connection_config.port,
                user=frombytes(opts.connection_config.user),
                replication=opts.replication,
                buffer_size=opts.buffer_size,
                default_block_size=opts.default_block_size,
                kerb_ticket=frombytes(opts.connection_config.kerb_ticket),
                extra_conf={frombytes(k): frombytes(v)
                            for k, v in opts.connection_config.extra_conf},
            ),)
        )
BIN
venv/lib/python3.9/site-packages/pyarrow/_hdfsio.cpython-39-darwin.so
Executable file
BIN
venv/lib/python3.9/site-packages/pyarrow/_hdfsio.cpython-39-darwin.so
Executable file
Binary file not shown.
480
venv/lib/python3.9/site-packages/pyarrow/_hdfsio.pyx
Normal file
480
venv/lib/python3.9/site-packages/pyarrow/_hdfsio.pyx
Normal file
@@ -0,0 +1,480 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# ----------------------------------------------------------------------
# HDFS IO implementation

# cython: language_level = 3

import re

from pyarrow.lib cimport check_status, _Weakrefable, NativeFile
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow.lib import frombytes, tobytes, ArrowIOError

from queue import Queue, Empty as QueueEmpty, Full as QueueFull


_HDFS_PATH_RE = re.compile(r'hdfs://(.*):(\d+)(.*)')


def have_libhdfs():
    try:
        with nogil:
            check_status(HaveLibHdfs())
        return True
    except Exception:
        return False


def strip_hdfs_abspath(path):
    m = _HDFS_PATH_RE.match(path)
    if m:
        return m.group(3)
    else:
        return path
|
||||
|
||||
|
||||
cdef class HadoopFileSystem(_Weakrefable):
|
||||
cdef:
|
||||
shared_ptr[CIOHadoopFileSystem] client
|
||||
|
||||
cdef readonly:
|
||||
bint is_open
|
||||
object host
|
||||
object user
|
||||
object kerb_ticket
|
||||
int port
|
||||
dict extra_conf
|
||||
|
||||
def _connect(self, host, port, user, kerb_ticket, extra_conf):
|
||||
cdef HdfsConnectionConfig conf
|
||||
|
||||
if host is not None:
|
||||
conf.host = tobytes(host)
|
||||
self.host = host
|
||||
|
||||
conf.port = port
|
||||
self.port = port
|
||||
|
||||
if user is not None:
|
||||
conf.user = tobytes(user)
|
||||
self.user = user
|
||||
|
||||
if kerb_ticket is not None:
|
||||
conf.kerb_ticket = tobytes(kerb_ticket)
|
||||
self.kerb_ticket = kerb_ticket
|
||||
|
||||
with nogil:
|
||||
check_status(HaveLibHdfs())
|
||||
|
||||
if extra_conf is not None and isinstance(extra_conf, dict):
|
||||
conf.extra_conf = {tobytes(k): tobytes(v)
|
||||
for k, v in extra_conf.items()}
|
||||
self.extra_conf = extra_conf
|
||||
|
||||
with nogil:
|
||||
check_status(CIOHadoopFileSystem.Connect(&conf, &self.client))
|
||||
self.is_open = True
|
||||
|
||||
@classmethod
|
||||
def connect(cls, *args, **kwargs):
|
||||
return cls(*args, **kwargs)
|
||||
|
||||
def __dealloc__(self):
|
||||
if self.is_open:
|
||||
self.close()
|
||||
|
||||
def close(self):
|
||||
"""
|
||||
Disconnect from the HDFS cluster
|
||||
"""
|
||||
self._ensure_client()
|
||||
with nogil:
|
||||
check_status(self.client.get().Disconnect())
|
||||
self.is_open = False
|
||||
|
||||
cdef _ensure_client(self):
|
||||
if self.client.get() == NULL:
|
||||
raise IOError('HDFS client improperly initialized')
|
||||
elif not self.is_open:
|
||||
raise IOError('HDFS client is closed')
|
||||
|
||||
    def exists(self, path):
        """
        Return True if the path is known to the cluster; return False if it
        is not (or if an RPC error occurs).
        """
        self._ensure_client()

        cdef c_string c_path = tobytes(path)
        cdef c_bool result
        with nogil:
            result = self.client.get().Exists(c_path)
        return result

    def isdir(self, path):
        cdef HdfsPathInfo info
        try:
            self._path_info(path, &info)
        except ArrowIOError:
            return False
        return info.kind == ObjectType_DIRECTORY

    def isfile(self, path):
        cdef HdfsPathInfo info
        try:
            self._path_info(path, &info)
        except ArrowIOError:
            return False
        return info.kind == ObjectType_FILE

    def get_capacity(self):
        """
        Get reported total capacity of file system

        Returns
        -------
        capacity : int
        """
        cdef int64_t capacity = 0
        with nogil:
            check_status(self.client.get().GetCapacity(&capacity))
        return capacity

    def get_space_used(self):
        """
        Get space used on file system

        Returns
        -------
        space_used : int
        """
        cdef int64_t space_used = 0
        with nogil:
            check_status(self.client.get().GetUsed(&space_used))
        return space_used

    def df(self):
        """
        Return free space on disk, like the UNIX df command

        Returns
        -------
        space : int
        """
        return self.get_capacity() - self.get_space_used()

    def rename(self, path, new_path):
        cdef c_string c_path = tobytes(path)
        cdef c_string c_new_path = tobytes(new_path)
        with nogil:
            check_status(self.client.get().Rename(c_path, c_new_path))

    def info(self, path):
        """
        Return detailed HDFS information for path

        Parameters
        ----------
        path : string
            Path to file or directory

        Returns
        -------
        path_info : dict
        """
        cdef HdfsPathInfo info
        self._path_info(path, &info)
        return {
            'path': frombytes(info.name),
            'owner': frombytes(info.owner),
            'group': frombytes(info.group),
            'size': info.size,
            'block_size': info.block_size,
            'last_modified': info.last_modified_time,
            'last_accessed': info.last_access_time,
            'replication': info.replication,
            'permissions': info.permissions,
            'kind': ('directory' if info.kind == ObjectType_DIRECTORY
                     else 'file')
        }

    def stat(self, path):
        """
        Return basic file system statistics about path

        Parameters
        ----------
        path : string
            Path to file or directory

        Returns
        -------
        stat : dict
        """
        cdef FileStatistics info
        cdef c_string c_path = tobytes(path)
        with nogil:
            check_status(self.client.get()
                         .Stat(c_path, &info))
        return {
            'size': info.size,
            'kind': ('directory' if info.kind == ObjectType_DIRECTORY
                     else 'file')
        }

    cdef _path_info(self, path, HdfsPathInfo* info):
        cdef c_string c_path = tobytes(path)

        with nogil:
            check_status(self.client.get()
                         .GetPathInfo(c_path, info))

    def ls(self, path, bint full_info):
        cdef:
            c_string c_path = tobytes(path)
            vector[HdfsPathInfo] listing
            list results = []
            int i

        self._ensure_client()

        with nogil:
            check_status(self.client.get()
                         .ListDirectory(c_path, &listing))

        cdef const HdfsPathInfo* info
        for i in range(<int> listing.size()):
            info = &listing[i]

            # Try to trim off the hdfs://HOST:PORT piece
            name = strip_hdfs_abspath(frombytes(info.name))

            if full_info:
                kind = ('file' if info.kind == ObjectType_FILE
                        else 'directory')

                results.append({
                    'kind': kind,
                    'name': name,
                    'owner': frombytes(info.owner),
                    'group': frombytes(info.group),
                    'last_modified_time': info.last_modified_time,
                    'last_access_time': info.last_access_time,
                    'size': info.size,
                    'replication': info.replication,
                    'block_size': info.block_size,
                    'permissions': info.permissions
                })
            else:
                results.append(name)

        return results

    def chmod(self, path, mode):
        """
        Change file permissions

        Parameters
        ----------
        path : string
            absolute path to file or directory
        mode : int
            POSIX-like bitmask
        """
        self._ensure_client()
        cdef c_string c_path = tobytes(path)
        cdef int c_mode = mode
        with nogil:
            check_status(self.client.get()
                         .Chmod(c_path, c_mode))

    def chown(self, path, owner=None, group=None):
        """
        Change file owner and/or group

        Parameters
        ----------
        path : string
            absolute path to file or directory
        owner : string, default None
            New owner, None for no change
        group : string, default None
            New group, None for no change
        """
        cdef:
            c_string c_path
            c_string c_owner
            c_string c_group
            const char* c_owner_ptr = NULL
            const char* c_group_ptr = NULL

        self._ensure_client()

        c_path = tobytes(path)
        if owner is not None:
            c_owner = tobytes(owner)
            c_owner_ptr = c_owner.c_str()

        if group is not None:
            c_group = tobytes(group)
            c_group_ptr = c_group.c_str()

        with nogil:
            check_status(self.client.get()
                         .Chown(c_path, c_owner_ptr, c_group_ptr))

    def mkdir(self, path):
        """
        Create indicated directory and any necessary parent directories
        """
        self._ensure_client()
        cdef c_string c_path = tobytes(path)
        with nogil:
            check_status(self.client.get()
                         .MakeDirectory(c_path))

    def delete(self, path, bint recursive=False):
        """
        Delete the indicated file or directory

        Parameters
        ----------
        path : string
        recursive : boolean, default False
            If True, also delete child paths for directories
        """
        self._ensure_client()

        cdef c_string c_path = tobytes(path)
        with nogil:
            check_status(self.client.get()
                         .Delete(c_path, recursive == 1))

    def open(self, path, mode='rb', buffer_size=None, replication=None,
             default_block_size=None):
        """
        Open HDFS file for reading or writing

        Parameters
        ----------
        path : string
            HDFS path of the file to open
        mode : string
            Must be one of 'rb', 'wb', 'ab'

        Returns
        -------
        handle : HdfsFile
        """
        self._ensure_client()

        cdef HdfsFile out = HdfsFile()

        if mode not in ('rb', 'wb', 'ab'):
            raise Exception("Mode must be 'rb' (read), "
                            "'wb' (write, new file), or 'ab' (append)")

        cdef c_string c_path = tobytes(path)
        cdef c_bool append = False

        # 0 in libhdfs means "use the default"
        cdef int32_t c_buffer_size = buffer_size or 0
        cdef int16_t c_replication = replication or 0
        cdef int64_t c_default_block_size = default_block_size or 0

        cdef shared_ptr[HdfsOutputStream] wr_handle
        cdef shared_ptr[HdfsReadableFile] rd_handle

        if mode in ('wb', 'ab'):
            if mode == 'ab':
                append = True

            with nogil:
                check_status(
                    self.client.get()
                    .OpenWritable(c_path, append, c_buffer_size,
                                  c_replication, c_default_block_size,
                                  &wr_handle))

            out.set_output_stream(<shared_ptr[COutputStream]> wr_handle)
            out.is_writable = True
        else:
            with nogil:
                check_status(self.client.get()
                             .OpenReadable(c_path, &rd_handle))

            out.set_random_access_file(
                <shared_ptr[CRandomAccessFile]> rd_handle)
            out.is_readable = True

        assert not out.closed

        if c_buffer_size == 0:
            c_buffer_size = 2 ** 16

        out.mode = mode
        out.buffer_size = c_buffer_size
        out.parent = _HdfsFileNanny(self, out)
        out.own_file = True

        return out

    def download(self, path, stream, buffer_size=None):
        with self.open(path, 'rb') as f:
            f.download(stream, buffer_size=buffer_size)

    def upload(self, path, stream, buffer_size=None):
        """
        Upload file-like object to HDFS path
        """
        with self.open(path, 'wb') as f:
            f.upload(stream, buffer_size=buffer_size)


# ARROW-404: Helper class to ensure that files are closed before the
# client. During deallocation of the extension class, the attributes are
# decref'd which can cause the client to get closed first if the file has the
# last remaining reference
cdef class _HdfsFileNanny(_Weakrefable):
    cdef:
        object client
        object file_handle_ref

    def __cinit__(self, client, file_handle):
        import weakref
        self.client = client
        self.file_handle_ref = weakref.ref(file_handle)

    def __dealloc__(self):
        fh = self.file_handle_ref()
        if fh:
            fh.close()
        # avoid cyclic GC
        self.file_handle_ref = None
        self.client = None


cdef class HdfsFile(NativeFile):
    cdef readonly:
        int32_t buffer_size
        object mode
        object parent

    def __dealloc__(self):
        self.parent = None
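A minimal usage sketch exercising the client defined above (an editorial illustration, not part of the vendored source; the host, port and paths are hypothetical, and a working libhdfs installation plus a reachable cluster are assumed):

fs = HadoopFileSystem.connect(host='namenode.example.com', port=8020,
                              user='test')
print(fs.ls('/data', full_info=False))   # bare names
print(fs.ls('/data', full_info=True))    # dicts with size/owner/permissions
with fs.open('/tmp/example.bin', 'wb') as f:
    f.write(b'hello hdfs')               # buffered HdfsFile handle
fs.delete('/tmp/example.bin')
fs.close()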
BIN
venv/lib/python3.9/site-packages/pyarrow/_json.cpython-39-darwin.so
Executable file
Binary file not shown.
261
venv/lib/python3.9/site-packages/pyarrow/_json.pyx
Normal file
@@ -0,0 +1,261 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: profile=False
# distutils: language = c++
# cython: language_level = 3

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (check_status, _Weakrefable, Field, MemoryPool,
                          ensure_type, maybe_unbox_memory_pool,
                          get_input_stream, pyarrow_wrap_table,
                          pyarrow_wrap_data_type, pyarrow_unwrap_data_type,
                          pyarrow_wrap_schema, pyarrow_unwrap_schema)


cdef class ReadOptions(_Weakrefable):
    """
    Options for reading JSON files.

    Parameters
    ----------
    use_threads : bool, optional (default True)
        Whether to use multiple threads to accelerate reading
    block_size : int, optional
        How many bytes to process at a time from the input stream.
        This will determine multi-threading granularity as well as
        the size of individual chunks in the Table.
    """
    cdef:
        CJSONReadOptions options

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, use_threads=None, block_size=None):
        self.options = CJSONReadOptions.Defaults()
        if use_threads is not None:
            self.use_threads = use_threads
        if block_size is not None:
            self.block_size = block_size

    @property
    def use_threads(self):
        """
        Whether to use multiple threads to accelerate reading.
        """
        return self.options.use_threads

    @use_threads.setter
    def use_threads(self, value):
        self.options.use_threads = value

    @property
    def block_size(self):
        """
        How many bytes to process at a time from the input stream.

        This will determine multi-threading granularity as well as the size of
        individual chunks in the Table.
        """
        return self.options.block_size

    @block_size.setter
    def block_size(self, value):
        self.options.block_size = value

    def __reduce__(self):
        return ReadOptions, (
            self.use_threads,
            self.block_size
        )
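
# Editorial example (not part of the original source): ReadOptions supports
# pickling through __reduce__ above; the values below are illustrative.
# >>> opts = ReadOptions(use_threads=False, block_size=1 << 20)
# >>> import pickle
# >>> pickle.loads(pickle.dumps(opts)).block_size
# 1048576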

cdef class ParseOptions(_Weakrefable):
    """
    Options for parsing JSON files.

    Parameters
    ----------
    explicit_schema : Schema, optional (default None)
        Optional explicit schema (no type inference, ignores other fields).
    newlines_in_values : bool, optional (default False)
        Whether objects may be printed across multiple lines (for example
        pretty-printed). If false, input must end with an empty line.
    unexpected_field_behavior : str, default "infer"
        How JSON fields outside of explicit_schema (if given) are treated.

        Possible behaviors:

        - "ignore": unexpected JSON fields are ignored
        - "error": error out on unexpected JSON fields
        - "infer": unexpected JSON fields are type-inferred and included in
          the output
    """

    cdef:
        CJSONParseOptions options

    __slots__ = ()

    def __init__(self, explicit_schema=None, newlines_in_values=None,
                 unexpected_field_behavior=None):
        self.options = CJSONParseOptions.Defaults()
        if explicit_schema is not None:
            self.explicit_schema = explicit_schema
        if newlines_in_values is not None:
            self.newlines_in_values = newlines_in_values
        if unexpected_field_behavior is not None:
            self.unexpected_field_behavior = unexpected_field_behavior

    def __reduce__(self):
        return ParseOptions, (
            self.explicit_schema,
            self.newlines_in_values,
            self.unexpected_field_behavior
        )

    @property
    def explicit_schema(self):
        """
        Optional explicit schema (no type inference, ignores other fields)
        """
        if self.options.explicit_schema.get() == NULL:
            return None
        else:
            return pyarrow_wrap_schema(self.options.explicit_schema)

    @explicit_schema.setter
    def explicit_schema(self, value):
        self.options.explicit_schema = pyarrow_unwrap_schema(value)

    @property
    def newlines_in_values(self):
        """
        Whether newline characters are allowed in JSON values.
        Setting this to True reduces the performance of multi-threaded
        JSON reading.
        """
        return self.options.newlines_in_values

    @newlines_in_values.setter
    def newlines_in_values(self, value):
        self.options.newlines_in_values = value

    @property
    def unexpected_field_behavior(self):
        """
        How JSON fields outside of explicit_schema (if given) are treated.

        Possible behaviors:

        - "ignore": unexpected JSON fields are ignored
        - "error": error out on unexpected JSON fields
        - "infer": unexpected JSON fields are type-inferred and included in
          the output

        Set to "infer" by default.
        """
        v = self.options.unexpected_field_behavior
        if v == CUnexpectedFieldBehavior_Ignore:
            return "ignore"
        elif v == CUnexpectedFieldBehavior_Error:
            return "error"
        elif v == CUnexpectedFieldBehavior_InferType:
            return "infer"
        else:
            raise ValueError('Unexpected value for unexpected_field_behavior')

    @unexpected_field_behavior.setter
    def unexpected_field_behavior(self, value):
        cdef CUnexpectedFieldBehavior v

        if value == "ignore":
            v = CUnexpectedFieldBehavior_Ignore
        elif value == "error":
            v = CUnexpectedFieldBehavior_Error
        elif value == "infer":
            v = CUnexpectedFieldBehavior_InferType
        else:
            raise ValueError(
                "Unexpected value `{}` for `unexpected_field_behavior`, pass "
                "either `ignore`, `error` or `infer`.".format(value)
            )

        self.options.unexpected_field_behavior = v
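
# Editorial example (not part of the original source): with an explicit
# schema, unexpected_field_behavior controls what happens to fields outside
# that schema. The names below are illustrative.
# >>> import pyarrow as pa
# >>> from pyarrow import json
# >>> schema = pa.schema([('x', pa.int64())])
# >>> opts = json.ParseOptions(explicit_schema=schema,
# ...                          unexpected_field_behavior='ignore')
# A row like {"x": 1, "y": 2} then yields a table with only column "x".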

cdef _get_reader(input_file, shared_ptr[CInputStream]* out):
    use_memory_map = False
    get_input_stream(input_file, use_memory_map, out)

cdef _get_read_options(ReadOptions read_options, CJSONReadOptions* out):
    if read_options is None:
        out[0] = CJSONReadOptions.Defaults()
    else:
        out[0] = read_options.options

cdef _get_parse_options(ParseOptions parse_options, CJSONParseOptions* out):
    if parse_options is None:
        out[0] = CJSONParseOptions.Defaults()
    else:
        out[0] = parse_options.options


def read_json(input_file, read_options=None, parse_options=None,
              MemoryPool memory_pool=None):
    """
    Read a Table from a stream of JSON data.

    Parameters
    ----------
    input_file : str, path or file-like object
        The location of JSON data. Currently only the line-delimited JSON
        format is supported.
    read_options : pyarrow.json.ReadOptions, optional
        Options for the JSON reader (see ReadOptions constructor for defaults).
    parse_options : pyarrow.json.ParseOptions, optional
        Options for the JSON parser
        (see ParseOptions constructor for defaults).
    memory_pool : MemoryPool, optional
        Pool to allocate Table memory from.

    Returns
    -------
    :class:`pyarrow.Table`
        Contents of the JSON file as an in-memory table.
    """
    cdef:
        shared_ptr[CInputStream] stream
        CJSONReadOptions c_read_options
        CJSONParseOptions c_parse_options
        shared_ptr[CJSONReader] reader
        shared_ptr[CTable] table

    _get_reader(input_file, &stream)
    _get_read_options(read_options, &c_read_options)
    _get_parse_options(parse_options, &c_parse_options)

    reader = GetResultValue(
        CJSONReader.Make(maybe_unbox_memory_pool(memory_pool),
                         stream, c_read_options, c_parse_options))

    with nogil:
        table = GetResultValue(reader.get().Read())

    return pyarrow_wrap_table(table)
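A short usage sketch for read_json (an editorial illustration, not part of the vendored source). The reader expects line-delimited JSON, one object per line:

import io
from pyarrow import json

data = b'{"x": 1, "y": "a"}\n{"x": 2, "y": "b"}\n'
table = json.read_json(io.BytesIO(data),
                       read_options=json.ReadOptions(block_size=1 << 20))
print(table.to_pydict())   # {'x': [1, 2], 'y': ['a', 'b']}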
BIN
venv/lib/python3.9/site-packages/pyarrow/_orc.cpython-39-darwin.so
Executable file
Binary file not shown.
134
venv/lib/python3.9/site-packages/pyarrow/_orc.pxd
Normal file
@@ -0,0 +1,134 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# distutils: language = c++
# cython: language_level = 3

from libcpp cimport bool as c_bool
from libc.string cimport const_char
from libcpp.vector cimport vector as std_vector
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CArray, CSchema, CStatus,
                                        CResult, CTable, CMemoryPool,
                                        CKeyValueMetadata,
                                        CRecordBatch,
                                        CTable, CCompressionType,
                                        CRandomAccessFile, COutputStream,
                                        TimeUnit)

cdef extern from "arrow/adapters/orc/options.h" \
        namespace "arrow::adapters::orc" nogil:
    cdef enum CompressionStrategy \
            " arrow::adapters::orc::CompressionStrategy":
        _CompressionStrategy_SPEED \
            " arrow::adapters::orc::CompressionStrategy::kSpeed"
        _CompressionStrategy_COMPRESSION \
            " arrow::adapters::orc::CompressionStrategy::kCompression"

    cdef enum WriterId" arrow::adapters::orc::WriterId":
        _WriterId_ORC_JAVA_WRITER" arrow::adapters::orc::WriterId::kOrcJava"
        _WriterId_ORC_CPP_WRITER" arrow::adapters::orc::WriterId::kOrcCpp"
        _WriterId_PRESTO_WRITER" arrow::adapters::orc::WriterId::kPresto"
        _WriterId_SCRITCHLEY_GO \
            " arrow::adapters::orc::WriterId::kScritchleyGo"
        _WriterId_TRINO_WRITER" arrow::adapters::orc::WriterId::kTrino"
        _WriterId_UNKNOWN_WRITER" arrow::adapters::orc::WriterId::kUnknown"

    cdef enum WriterVersion" arrow::adapters::orc::WriterVersion":
        _WriterVersion_ORIGINAL \
            " arrow::adapters::orc::WriterVersion::kOriginal"
        _WriterVersion_HIVE_8732 \
            " arrow::adapters::orc::WriterVersion::kHive8732"
        _WriterVersion_HIVE_4243 \
            " arrow::adapters::orc::WriterVersion::kHive4243"
        _WriterVersion_HIVE_12055 \
            " arrow::adapters::orc::WriterVersion::kHive12055"
        _WriterVersion_HIVE_13083 \
            " arrow::adapters::orc::WriterVersion::kHive13083"
        _WriterVersion_ORC_101" arrow::adapters::orc::WriterVersion::kOrc101"
        _WriterVersion_ORC_135" arrow::adapters::orc::WriterVersion::kOrc135"
        _WriterVersion_ORC_517" arrow::adapters::orc::WriterVersion::kOrc517"
        _WriterVersion_ORC_203" arrow::adapters::orc::WriterVersion::kOrc203"
        _WriterVersion_ORC_14" arrow::adapters::orc::WriterVersion::kOrc14"
        _WriterVersion_MAX" arrow::adapters::orc::WriterVersion::kMax"

    cdef cppclass FileVersion" arrow::adapters::orc::FileVersion":
        FileVersion(uint32_t major_version, uint32_t minor_version)
        uint32_t major_version()
        uint32_t minor_version()
        c_string ToString()

    cdef struct WriteOptions" arrow::adapters::orc::WriteOptions":
        int64_t batch_size
        FileVersion file_version
        int64_t stripe_size
        CCompressionType compression
        int64_t compression_block_size
        CompressionStrategy compression_strategy
        int64_t row_index_stride
        double padding_tolerance
        double dictionary_key_size_threshold
        std_vector[int64_t] bloom_filter_columns
        double bloom_filter_fpp


cdef extern from "arrow/adapters/orc/adapter.h" \
        namespace "arrow::adapters::orc" nogil:

    cdef cppclass ORCFileReader:
        @staticmethod
        CResult[unique_ptr[ORCFileReader]] Open(
            const shared_ptr[CRandomAccessFile]& file,
            CMemoryPool* pool)

        CResult[shared_ptr[const CKeyValueMetadata]] ReadMetadata()

        CResult[shared_ptr[CSchema]] ReadSchema()

        CResult[shared_ptr[CRecordBatch]] ReadStripe(int64_t stripe)
        CResult[shared_ptr[CRecordBatch]] ReadStripe(
            int64_t stripe, std_vector[c_string])

        CResult[shared_ptr[CTable]] Read()
        CResult[shared_ptr[CTable]] Read(std_vector[c_string])

        int64_t NumberOfStripes()
        int64_t NumberOfRows()
        FileVersion GetFileVersion()
        c_string GetSoftwareVersion()
        CResult[CCompressionType] GetCompression()
        int64_t GetCompressionSize()
        int64_t GetRowIndexStride()
        WriterId GetWriterId()
        int32_t GetWriterIdValue()
        WriterVersion GetWriterVersion()
        int64_t GetNumberOfStripeStatistics()
        int64_t GetContentLength()
        int64_t GetStripeStatisticsLength()
        int64_t GetFileFooterLength()
        int64_t GetFilePostscriptLength()
        int64_t GetFileLength()
        c_string GetSerializedFileTail()

    cdef cppclass ORCFileWriter:
        @staticmethod
        CResult[unique_ptr[ORCFileWriter]] Open(
            COutputStream* output_stream, const WriteOptions& writer_options)

        CStatus Write(const CTable& table)

        CStatus Close()
449
venv/lib/python3.9/site-packages/pyarrow/_orc.pyx
Normal file
@@ -0,0 +1,449 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: profile=False
# distutils: language = c++

from cython.operator cimport dereference as deref
from libcpp.vector cimport vector as std_vector
from libcpp.utility cimport move
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (check_status, _Weakrefable,
                          MemoryPool, maybe_unbox_memory_pool,
                          Schema, pyarrow_wrap_schema,
                          KeyValueMetadata,
                          pyarrow_wrap_batch,
                          RecordBatch,
                          Table,
                          pyarrow_wrap_table,
                          pyarrow_unwrap_schema,
                          pyarrow_wrap_metadata,
                          pyarrow_unwrap_table,
                          get_reader,
                          get_writer)
from pyarrow.lib import frombytes, tobytes
from pyarrow.util import _stringify_path


cdef compression_type_from_enum(CCompressionType compression_type):
    compression_map = {
        CCompressionType_UNCOMPRESSED: 'UNCOMPRESSED',
        CCompressionType_GZIP: 'ZLIB',
        CCompressionType_SNAPPY: 'SNAPPY',
        CCompressionType_LZ4: 'LZ4',
        CCompressionType_ZSTD: 'ZSTD',
    }
    if compression_type in compression_map:
        return compression_map[compression_type]
    raise ValueError('Unsupported compression')


cdef CCompressionType compression_type_from_name(name) except *:
    if not isinstance(name, str):
        raise TypeError('compression must be a string')
    name = name.upper()
    if name == 'ZLIB':
        return CCompressionType_GZIP
    elif name == 'SNAPPY':
        return CCompressionType_SNAPPY
    elif name == 'LZ4':
        return CCompressionType_LZ4
    elif name == 'ZSTD':
        return CCompressionType_ZSTD
    elif name == 'UNCOMPRESSED':
        return CCompressionType_UNCOMPRESSED
    raise ValueError(f'Unknown CompressionKind: {name}')


cdef compression_strategy_from_enum(
    CompressionStrategy compression_strategy
):
    compression_strategy_map = {
        _CompressionStrategy_SPEED: 'SPEED',
        _CompressionStrategy_COMPRESSION: 'COMPRESSION',
    }
    if compression_strategy in compression_strategy_map:
        return compression_strategy_map[compression_strategy]
    raise ValueError('Unsupported compression strategy')


cdef CompressionStrategy compression_strategy_from_name(name) except *:
    if not isinstance(name, str):
        raise TypeError('compression strategy must be a string')
    name = name.upper()
    if name == 'COMPRESSION':
        return _CompressionStrategy_COMPRESSION
    elif name == 'SPEED':
        return _CompressionStrategy_SPEED
    raise ValueError(f'Unknown CompressionStrategy: {name}')


cdef file_version_from_class(FileVersion file_version):
    return frombytes(file_version.ToString())


cdef writer_id_from_enum(WriterId writer_id):
    writer_id_map = {
        _WriterId_ORC_JAVA_WRITER: 'ORC_JAVA',
        _WriterId_ORC_CPP_WRITER: 'ORC_CPP',
        _WriterId_PRESTO_WRITER: 'PRESTO',
        _WriterId_SCRITCHLEY_GO: 'SCRITCHLEY_GO',
        _WriterId_TRINO_WRITER: 'TRINO',
        # Editorial fix: map the unknown marker too, so ORCReader.writer()
        # below can fall back to the raw numeric id instead of raising here.
        _WriterId_UNKNOWN_WRITER: 'UNKNOWN',
    }
    if writer_id in writer_id_map:
        return writer_id_map[writer_id]
    raise ValueError('Unsupported writer ID')


cdef writer_version_from_enum(WriterVersion writer_version):
    writer_version_map = {
        _WriterVersion_ORIGINAL: 'ORIGINAL',
        _WriterVersion_HIVE_8732: 'HIVE_8732',
        _WriterVersion_HIVE_4243: 'HIVE_4243',
        _WriterVersion_HIVE_12055: 'HIVE_12055',
        _WriterVersion_HIVE_13083: 'HIVE_13083',
        _WriterVersion_ORC_101: 'ORC_101',
        _WriterVersion_ORC_135: 'ORC_135',
        _WriterVersion_ORC_517: 'ORC_517',
        _WriterVersion_ORC_203: 'ORC_203',
        _WriterVersion_ORC_14: 'ORC_14',
    }
    if writer_version in writer_version_map:
        return writer_version_map[writer_version]
    raise ValueError('Unsupported writer version')

cdef shared_ptr[WriteOptions] _create_write_options(
    file_version=None,
    batch_size=None,
    stripe_size=None,
    compression=None,
    compression_block_size=None,
    compression_strategy=None,
    row_index_stride=None,
    padding_tolerance=None,
    dictionary_key_size_threshold=None,
    bloom_filter_columns=None,
    bloom_filter_fpp=None
) except *:
    """General writer options"""
    cdef:
        shared_ptr[WriteOptions] options
    options = make_shared[WriteOptions]()
    # batch_size
    if batch_size is not None:
        if isinstance(batch_size, int) and batch_size > 0:
            deref(options).batch_size = batch_size
        else:
            raise ValueError(f"Invalid ORC writer batch size: {batch_size}")
    # file_version
    if file_version is not None:
        if file_version == "0.12":
            deref(options).file_version = FileVersion(0, 12)
        elif file_version == "0.11":
            deref(options).file_version = FileVersion(0, 11)
        else:
            raise ValueError(f"Unsupported ORC file version: {file_version}")
    # stripe_size
    if stripe_size is not None:
        if isinstance(stripe_size, int) and stripe_size > 0:
            deref(options).stripe_size = stripe_size
        else:
            raise ValueError(f"Invalid ORC stripe size: {stripe_size}")
    # compression
    if compression is not None:
        if isinstance(compression, str):
            deref(options).compression = compression_type_from_name(
                compression)
        else:
            raise TypeError("Unsupported ORC compression type: "
                            f"{compression}")
    # compression_block_size
    if compression_block_size is not None:
        if (isinstance(compression_block_size, int) and
                compression_block_size > 0):
            deref(options).compression_block_size = compression_block_size
        else:
            raise ValueError("Invalid ORC compression block size: "
                             f"{compression_block_size}")
    # compression_strategy
    if compression_strategy is not None:
        # Editorial fix: the original tested isinstance(compression, str)
        # here, an apparent copy-paste slip; the strategy argument itself
        # must be the string being checked.
        if isinstance(compression_strategy, str):
            deref(options).compression_strategy = \
                compression_strategy_from_name(compression_strategy)
        else:
            raise TypeError("Unsupported ORC compression strategy: "
                            f"{compression_strategy}")
    # row_index_stride
    if row_index_stride is not None:
        if isinstance(row_index_stride, int) and row_index_stride > 0:
            deref(options).row_index_stride = row_index_stride
        else:
            raise ValueError("Invalid ORC row index stride: "
                             f"{row_index_stride}")
    # padding_tolerance
    if padding_tolerance is not None:
        try:
            padding_tolerance = float(padding_tolerance)
            deref(options).padding_tolerance = padding_tolerance
        except Exception:
            raise ValueError("Invalid ORC padding tolerance: "
                             f"{padding_tolerance}")
    # dictionary_key_size_threshold
    if dictionary_key_size_threshold is not None:
        try:
            dictionary_key_size_threshold = float(
                dictionary_key_size_threshold)
            assert 0 <= dictionary_key_size_threshold <= 1
            deref(options).dictionary_key_size_threshold = \
                dictionary_key_size_threshold
        except Exception:
            raise ValueError("Invalid ORC dictionary key size threshold: "
                             f"{dictionary_key_size_threshold}")
    # bloom_filter_columns
    if bloom_filter_columns is not None:
        try:
            bloom_filter_columns = list(bloom_filter_columns)
            for col in bloom_filter_columns:
                assert isinstance(col, int) and col >= 0
            deref(options).bloom_filter_columns = bloom_filter_columns
        except Exception:
            raise ValueError("Invalid ORC BloomFilter columns: "
                             f"{bloom_filter_columns}")
    # Max false positive rate of the Bloom Filter
    if bloom_filter_fpp is not None:
        try:
            bloom_filter_fpp = float(bloom_filter_fpp)
            assert 0 <= bloom_filter_fpp <= 1
            deref(options).bloom_filter_fpp = bloom_filter_fpp
        except Exception:
            raise ValueError("Invalid ORC BloomFilter false positive rate: "
                             f"{bloom_filter_fpp}")
    return options
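
# Editorial note (not part of the original source): every keyword above is
# validated before being copied into the C++ WriteOptions struct, so bad
# values fail eagerly in Python. For example (values illustrative):
# >>> _create_write_options(file_version='0.12', compression='zstd',
# ...                       bloom_filter_fpp=0.05)   # OK
# >>> _create_write_options(stripe_size=-1)          # raises ValueError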

cdef class ORCReader(_Weakrefable):
    cdef:
        object source
        CMemoryPool* allocator
        unique_ptr[ORCFileReader] reader

    def __cinit__(self, MemoryPool memory_pool=None):
        self.allocator = maybe_unbox_memory_pool(memory_pool)

    def open(self, object source, c_bool use_memory_map=True):
        cdef:
            shared_ptr[CRandomAccessFile] rd_handle

        self.source = source

        get_reader(source, use_memory_map, &rd_handle)
        with nogil:
            self.reader = move(GetResultValue(
                ORCFileReader.Open(rd_handle, self.allocator)
            ))

    def metadata(self):
        """
        The arrow metadata for this file.

        Returns
        -------
        metadata : pyarrow.KeyValueMetadata
        """
        cdef:
            shared_ptr[const CKeyValueMetadata] sp_arrow_metadata

        with nogil:
            sp_arrow_metadata = GetResultValue(
                deref(self.reader).ReadMetadata()
            )

        return pyarrow_wrap_metadata(sp_arrow_metadata)

    def schema(self):
        """
        The arrow schema for this file.

        Returns
        -------
        schema : pyarrow.Schema
        """
        cdef:
            shared_ptr[CSchema] sp_arrow_schema

        with nogil:
            sp_arrow_schema = GetResultValue(deref(self.reader).ReadSchema())

        return pyarrow_wrap_schema(sp_arrow_schema)

    def nrows(self):
        return deref(self.reader).NumberOfRows()

    def nstripes(self):
        return deref(self.reader).NumberOfStripes()

    def file_version(self):
        return file_version_from_class(deref(self.reader).GetFileVersion())

    def software_version(self):
        return frombytes(deref(self.reader).GetSoftwareVersion())

    def compression(self):
        return compression_type_from_enum(
            GetResultValue(deref(self.reader).GetCompression()))

    def compression_size(self):
        return deref(self.reader).GetCompressionSize()

    def row_index_stride(self):
        return deref(self.reader).GetRowIndexStride()

    def writer(self):
        writer_name = writer_id_from_enum(deref(self.reader).GetWriterId())
        if writer_name == 'UNKNOWN':
            return deref(self.reader).GetWriterIdValue()
        else:
            return writer_name

    def writer_version(self):
        return writer_version_from_enum(deref(self.reader).GetWriterVersion())

    def nstripe_statistics(self):
        return deref(self.reader).GetNumberOfStripeStatistics()

    def content_length(self):
        return deref(self.reader).GetContentLength()

    def stripe_statistics_length(self):
        return deref(self.reader).GetStripeStatisticsLength()

    def file_footer_length(self):
        return deref(self.reader).GetFileFooterLength()

    def file_postscript_length(self):
        return deref(self.reader).GetFilePostscriptLength()

    def file_length(self):
        return deref(self.reader).GetFileLength()

    def serialized_file_tail(self):
        return deref(self.reader).GetSerializedFileTail()

    def read_stripe(self, n, columns=None):
        cdef:
            shared_ptr[CRecordBatch] sp_record_batch
            RecordBatch batch
            int64_t stripe
            std_vector[c_string] c_names

        stripe = n

        if columns is None:
            with nogil:
                sp_record_batch = GetResultValue(
                    deref(self.reader).ReadStripe(stripe)
                )
        else:
            c_names = [tobytes(name) for name in columns]
            with nogil:
                sp_record_batch = GetResultValue(
                    deref(self.reader).ReadStripe(stripe, c_names)
                )

        return pyarrow_wrap_batch(sp_record_batch)

    def read(self, columns=None):
        cdef:
            shared_ptr[CTable] sp_table
            std_vector[c_string] c_names

        if columns is None:
            with nogil:
                sp_table = GetResultValue(deref(self.reader).Read())
        else:
            c_names = [tobytes(name) for name in columns]
            with nogil:
                sp_table = GetResultValue(deref(self.reader).Read(c_names))

        return pyarrow_wrap_table(sp_table)


cdef class ORCWriter(_Weakrefable):
    cdef:
        unique_ptr[ORCFileWriter] writer
        shared_ptr[COutputStream] sink
        c_bool own_sink

    def open(self, object where, *,
             file_version=None,
             batch_size=None,
             stripe_size=None,
             compression=None,
             compression_block_size=None,
             compression_strategy=None,
             row_index_stride=None,
             padding_tolerance=None,
             dictionary_key_size_threshold=None,
             bloom_filter_columns=None,
             bloom_filter_fpp=None):
        cdef:
            shared_ptr[WriteOptions] write_options
            c_string c_where
        try:
            where = _stringify_path(where)
        except TypeError:
            get_writer(where, &self.sink)
            self.own_sink = False
        else:
            c_where = tobytes(where)
            with nogil:
                self.sink = GetResultValue(FileOutputStream.Open(c_where))
            self.own_sink = True

        write_options = _create_write_options(
            file_version=file_version,
            batch_size=batch_size,
            stripe_size=stripe_size,
            compression=compression,
            compression_block_size=compression_block_size,
            compression_strategy=compression_strategy,
            row_index_stride=row_index_stride,
            padding_tolerance=padding_tolerance,
            dictionary_key_size_threshold=dictionary_key_size_threshold,
            bloom_filter_columns=bloom_filter_columns,
            bloom_filter_fpp=bloom_filter_fpp
        )

        with nogil:
            self.writer = move(GetResultValue(
                ORCFileWriter.Open(self.sink.get(),
                                   deref(write_options))))

    def write(self, Table table):
        cdef:
            shared_ptr[CTable] sp_table
        sp_table = pyarrow_unwrap_table(table)
        with nogil:
            check_status(deref(self.writer).Write(deref(sp_table)))

    def close(self):
        with nogil:
            check_status(deref(self.writer).Close())
            if self.own_sink:
                check_status(deref(self.sink).Close())
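A minimal round-trip sketch using the internal reader/writer defined above (an editorial illustration, not part of the vendored source; in application code the public pyarrow.orc module wraps these classes):

import io
import pyarrow as pa
from pyarrow._orc import ORCReader, ORCWriter

table = pa.table({'x': [1, 2, 3]})
buf = io.BytesIO()
writer = ORCWriter()
writer.open(buf, compression='ZLIB')   # keyword-only write options
writer.write(table)
writer.close()

buf.seek(0)
reader = ORCReader()
reader.open(buf)
assert reader.read().equals(table)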
BIN
venv/lib/python3.9/site-packages/pyarrow/_parquet.cpython-39-darwin.so
Executable file
Binary file not shown.
646
venv/lib/python3.9/site-packages/pyarrow/_parquet.pxd
Normal file
@@ -0,0 +1,646 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
# distutils: language = c++
|
||||
# cython: language_level = 3
|
||||
|
||||
from pyarrow.includes.common cimport *
|
||||
from pyarrow.includes.libarrow cimport (CChunkedArray, CScalar, CSchema, CStatus,
|
||||
CTable, CMemoryPool, CBuffer,
|
||||
CKeyValueMetadata,
|
||||
CRandomAccessFile, COutputStream,
|
||||
TimeUnit, CRecordBatchReader)
|
||||
from pyarrow.lib cimport _Weakrefable
|
||||
|
||||
|
||||
cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
|
||||
cdef cppclass Node:
|
||||
pass
|
||||
|
||||
cdef cppclass GroupNode(Node):
|
||||
pass
|
||||
|
||||
cdef cppclass PrimitiveNode(Node):
|
||||
pass
|
||||
|
||||
cdef cppclass ColumnPath:
|
||||
c_string ToDotString()
|
||||
vector[c_string] ToDotVector()
|
||||
|
||||
|
||||
cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
|
||||
enum ParquetType" parquet::Type::type":
|
||||
ParquetType_BOOLEAN" parquet::Type::BOOLEAN"
|
||||
ParquetType_INT32" parquet::Type::INT32"
|
||||
ParquetType_INT64" parquet::Type::INT64"
|
||||
ParquetType_INT96" parquet::Type::INT96"
|
||||
ParquetType_FLOAT" parquet::Type::FLOAT"
|
||||
ParquetType_DOUBLE" parquet::Type::DOUBLE"
|
||||
ParquetType_BYTE_ARRAY" parquet::Type::BYTE_ARRAY"
|
||||
ParquetType_FIXED_LEN_BYTE_ARRAY" parquet::Type::FIXED_LEN_BYTE_ARRAY"
|
||||
|
||||
enum ParquetLogicalTypeId" parquet::LogicalType::Type::type":
|
||||
ParquetLogicalType_UNDEFINED" parquet::LogicalType::Type::UNDEFINED"
|
||||
ParquetLogicalType_STRING" parquet::LogicalType::Type::STRING"
|
||||
ParquetLogicalType_MAP" parquet::LogicalType::Type::MAP"
|
||||
ParquetLogicalType_LIST" parquet::LogicalType::Type::LIST"
|
||||
ParquetLogicalType_ENUM" parquet::LogicalType::Type::ENUM"
|
||||
ParquetLogicalType_DECIMAL" parquet::LogicalType::Type::DECIMAL"
|
||||
ParquetLogicalType_DATE" parquet::LogicalType::Type::DATE"
|
||||
ParquetLogicalType_TIME" parquet::LogicalType::Type::TIME"
|
||||
ParquetLogicalType_TIMESTAMP" parquet::LogicalType::Type::TIMESTAMP"
|
||||
ParquetLogicalType_INT" parquet::LogicalType::Type::INT"
|
||||
ParquetLogicalType_JSON" parquet::LogicalType::Type::JSON"
|
||||
ParquetLogicalType_BSON" parquet::LogicalType::Type::BSON"
|
||||
ParquetLogicalType_UUID" parquet::LogicalType::Type::UUID"
|
||||
ParquetLogicalType_NONE" parquet::LogicalType::Type::NONE"
|
||||
|
||||
enum ParquetTimeUnit" parquet::LogicalType::TimeUnit::unit":
|
||||
ParquetTimeUnit_UNKNOWN" parquet::LogicalType::TimeUnit::UNKNOWN"
|
||||
ParquetTimeUnit_MILLIS" parquet::LogicalType::TimeUnit::MILLIS"
|
||||
ParquetTimeUnit_MICROS" parquet::LogicalType::TimeUnit::MICROS"
|
||||
ParquetTimeUnit_NANOS" parquet::LogicalType::TimeUnit::NANOS"
|
||||
|
||||
enum ParquetConvertedType" parquet::ConvertedType::type":
|
||||
ParquetConvertedType_NONE" parquet::ConvertedType::NONE"
|
||||
ParquetConvertedType_UTF8" parquet::ConvertedType::UTF8"
|
||||
ParquetConvertedType_MAP" parquet::ConvertedType::MAP"
|
||||
ParquetConvertedType_MAP_KEY_VALUE \
|
||||
" parquet::ConvertedType::MAP_KEY_VALUE"
|
||||
ParquetConvertedType_LIST" parquet::ConvertedType::LIST"
|
||||
ParquetConvertedType_ENUM" parquet::ConvertedType::ENUM"
|
||||
ParquetConvertedType_DECIMAL" parquet::ConvertedType::DECIMAL"
|
||||
ParquetConvertedType_DATE" parquet::ConvertedType::DATE"
|
||||
ParquetConvertedType_TIME_MILLIS" parquet::ConvertedType::TIME_MILLIS"
|
||||
ParquetConvertedType_TIME_MICROS" parquet::ConvertedType::TIME_MICROS"
|
||||
ParquetConvertedType_TIMESTAMP_MILLIS \
|
||||
" parquet::ConvertedType::TIMESTAMP_MILLIS"
|
||||
ParquetConvertedType_TIMESTAMP_MICROS \
|
||||
" parquet::ConvertedType::TIMESTAMP_MICROS"
|
||||
ParquetConvertedType_UINT_8" parquet::ConvertedType::UINT_8"
|
||||
ParquetConvertedType_UINT_16" parquet::ConvertedType::UINT_16"
|
||||
ParquetConvertedType_UINT_32" parquet::ConvertedType::UINT_32"
|
||||
ParquetConvertedType_UINT_64" parquet::ConvertedType::UINT_64"
|
||||
ParquetConvertedType_INT_8" parquet::ConvertedType::INT_8"
|
||||
ParquetConvertedType_INT_16" parquet::ConvertedType::INT_16"
|
||||
ParquetConvertedType_INT_32" parquet::ConvertedType::INT_32"
|
||||
ParquetConvertedType_INT_64" parquet::ConvertedType::INT_64"
|
||||
ParquetConvertedType_JSON" parquet::ConvertedType::JSON"
|
||||
ParquetConvertedType_BSON" parquet::ConvertedType::BSON"
|
||||
ParquetConvertedType_INTERVAL" parquet::ConvertedType::INTERVAL"
|
||||
|
||||
enum ParquetRepetition" parquet::Repetition::type":
|
||||
ParquetRepetition_REQUIRED" parquet::REPETITION::REQUIRED"
|
||||
ParquetRepetition_OPTIONAL" parquet::REPETITION::OPTIONAL"
|
||||
ParquetRepetition_REPEATED" parquet::REPETITION::REPEATED"
|
||||
|
||||
enum ParquetEncoding" parquet::Encoding::type":
|
||||
ParquetEncoding_PLAIN" parquet::Encoding::PLAIN"
|
||||
ParquetEncoding_PLAIN_DICTIONARY" parquet::Encoding::PLAIN_DICTIONARY"
|
||||
ParquetEncoding_RLE" parquet::Encoding::RLE"
|
||||
ParquetEncoding_BIT_PACKED" parquet::Encoding::BIT_PACKED"
|
||||
ParquetEncoding_DELTA_BINARY_PACKED \
|
||||
" parquet::Encoding::DELTA_BINARY_PACKED"
|
||||
ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY \
|
||||
" parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY"
|
||||
ParquetEncoding_DELTA_BYTE_ARRAY" parquet::Encoding::DELTA_BYTE_ARRAY"
|
||||
ParquetEncoding_RLE_DICTIONARY" parquet::Encoding::RLE_DICTIONARY"
|
||||
ParquetEncoding_BYTE_STREAM_SPLIT \
|
||||
" parquet::Encoding::BYTE_STREAM_SPLIT"
|
||||
|
||||
enum ParquetCompression" parquet::Compression::type":
|
||||
ParquetCompression_UNCOMPRESSED" parquet::Compression::UNCOMPRESSED"
|
||||
ParquetCompression_SNAPPY" parquet::Compression::SNAPPY"
|
||||
ParquetCompression_GZIP" parquet::Compression::GZIP"
|
||||
ParquetCompression_LZO" parquet::Compression::LZO"
|
||||
ParquetCompression_BROTLI" parquet::Compression::BROTLI"
|
||||
ParquetCompression_LZ4" parquet::Compression::LZ4"
|
||||
ParquetCompression_ZSTD" parquet::Compression::ZSTD"
|
||||
|
||||
enum ParquetVersion" parquet::ParquetVersion::type":
|
||||
ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0"
|
||||
ParquetVersion_V2_0" parquet::ParquetVersion::PARQUET_2_0"
|
||||
ParquetVersion_V2_4" parquet::ParquetVersion::PARQUET_2_4"
|
||||
ParquetVersion_V2_6" parquet::ParquetVersion::PARQUET_2_6"
|
||||
|
||||
enum ParquetSortOrder" parquet::SortOrder::type":
|
||||
ParquetSortOrder_SIGNED" parquet::SortOrder::SIGNED"
|
||||
ParquetSortOrder_UNSIGNED" parquet::SortOrder::UNSIGNED"
|
||||
ParquetSortOrder_UNKNOWN" parquet::SortOrder::UNKNOWN"
|
||||
|
||||
cdef cppclass CParquetLogicalType" parquet::LogicalType":
|
||||
c_string ToString() const
|
||||
c_string ToJSON() const
|
||||
ParquetLogicalTypeId type() const
|
||||
|
||||
cdef cppclass CParquetDecimalType \
|
||||
" parquet::DecimalLogicalType"(CParquetLogicalType):
|
||||
int32_t precision() const
|
||||
int32_t scale() const
|
||||
|
||||
cdef cppclass CParquetIntType \
|
||||
" parquet::IntLogicalType"(CParquetLogicalType):
|
||||
int bit_width() const
|
||||
c_bool is_signed() const
|
||||
|
||||
cdef cppclass CParquetTimeType \
|
||||
" parquet::TimeLogicalType"(CParquetLogicalType):
|
||||
c_bool is_adjusted_to_utc() const
|
||||
ParquetTimeUnit time_unit() const
|
||||
|
||||
cdef cppclass CParquetTimestampType \
|
||||
" parquet::TimestampLogicalType"(CParquetLogicalType):
|
||||
c_bool is_adjusted_to_utc() const
|
||||
ParquetTimeUnit time_unit() const
|
||||
|
||||
cdef cppclass ColumnDescriptor" parquet::ColumnDescriptor":
|
||||
c_bool Equals(const ColumnDescriptor& other)
|
||||
|
||||
shared_ptr[ColumnPath] path()
|
||||
int16_t max_definition_level()
|
||||
int16_t max_repetition_level()
|
||||
|
||||
ParquetType physical_type()
|
||||
const shared_ptr[const CParquetLogicalType]& logical_type()
|
||||
ParquetConvertedType converted_type()
|
||||
const c_string& name()
|
||||
int type_length()
|
||||
int type_precision()
|
||||
int type_scale()
|
||||
|
||||
cdef cppclass SchemaDescriptor:
|
||||
const ColumnDescriptor* Column(int i)
|
||||
shared_ptr[Node] schema()
|
||||
GroupNode* group()
|
||||
c_bool Equals(const SchemaDescriptor& other)
|
||||
c_string ToString()
|
||||
int num_columns()
|
||||
|
||||
cdef c_string FormatStatValue(ParquetType parquet_type, c_string val)
|
||||
|
||||
enum ParquetCipher" parquet::ParquetCipher::type":
|
||||
ParquetCipher_AES_GCM_V1" parquet::ParquetCipher::AES_GCM_V1"
|
||||
ParquetCipher_AES_GCM_CTR_V1" parquet::ParquetCipher::AES_GCM_CTR_V1"
|
||||
|
||||
struct AadMetadata:
|
||||
c_string aad_prefix
|
||||
c_string aad_file_unique
|
||||
c_bool supply_aad_prefix
|
||||
|
||||
struct EncryptionAlgorithm:
|
||||
ParquetCipher algorithm
|
||||
AadMetadata aad
|
||||
|
||||
cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
|
||||
cdef cppclass ColumnReader:
|
||||
pass
|
||||
|
||||
cdef cppclass BoolReader(ColumnReader):
|
||||
pass
|
||||
|
||||
cdef cppclass Int32Reader(ColumnReader):
|
||||
pass
|
||||
|
||||
cdef cppclass Int64Reader(ColumnReader):
|
||||
pass
|
||||
|
||||
cdef cppclass Int96Reader(ColumnReader):
|
||||
pass
|
||||
|
||||
cdef cppclass FloatReader(ColumnReader):
|
        pass

    cdef cppclass DoubleReader(ColumnReader):
        pass

    cdef cppclass ByteArrayReader(ColumnReader):
        pass

    cdef cppclass RowGroupReader:
        pass

    cdef cppclass CEncodedStatistics" parquet::EncodedStatistics":
        const c_string& max() const
        const c_string& min() const
        int64_t null_count
        int64_t distinct_count
        bint has_min
        bint has_max
        bint has_null_count
        bint has_distinct_count

    cdef cppclass ParquetByteArray" parquet::ByteArray":
        uint32_t len
        const uint8_t* ptr

    cdef cppclass ParquetFLBA" parquet::FLBA":
        const uint8_t* ptr

    cdef cppclass CStatistics" parquet::Statistics":
        int64_t null_count() const
        int64_t distinct_count() const
        int64_t num_values() const
        bint HasMinMax()
        bint HasNullCount()
        bint HasDistinctCount()
        c_bool Equals(const CStatistics&) const
        void Reset()
        c_string EncodeMin()
        c_string EncodeMax()
        CEncodedStatistics Encode()
        void SetComparator()
        ParquetType physical_type() const
        const ColumnDescriptor* descr() const

    cdef cppclass CBoolStatistics" parquet::BoolStatistics"(CStatistics):
        c_bool min()
        c_bool max()

    cdef cppclass CInt32Statistics" parquet::Int32Statistics"(CStatistics):
        int32_t min()
        int32_t max()

    cdef cppclass CInt64Statistics" parquet::Int64Statistics"(CStatistics):
        int64_t min()
        int64_t max()

    cdef cppclass CFloatStatistics" parquet::FloatStatistics"(CStatistics):
        float min()
        float max()

    cdef cppclass CDoubleStatistics" parquet::DoubleStatistics"(CStatistics):
        double min()
        double max()

    cdef cppclass CByteArrayStatistics \
            " parquet::ByteArrayStatistics"(CStatistics):
        ParquetByteArray min()
        ParquetByteArray max()

    cdef cppclass CFLBAStatistics" parquet::FLBAStatistics"(CStatistics):
        ParquetFLBA min()
        ParquetFLBA max()

    cdef cppclass CColumnCryptoMetaData" parquet::ColumnCryptoMetaData":
        shared_ptr[ColumnPath] path_in_schema() const
        c_bool encrypted_with_footer_key() const
        const c_string& key_metadata() const

    cdef cppclass CColumnChunkMetaData" parquet::ColumnChunkMetaData":
        int64_t file_offset() const
        const c_string& file_path() const

        c_bool is_metadata_set() const
        ParquetType type() const
        int64_t num_values() const
        shared_ptr[ColumnPath] path_in_schema() const
        bint is_stats_set() const
        shared_ptr[CStatistics] statistics() const
        ParquetCompression compression() const
        const vector[ParquetEncoding]& encodings() const
        c_bool Equals(const CColumnChunkMetaData&) const

        int64_t has_dictionary_page() const
        int64_t dictionary_page_offset() const
        int64_t data_page_offset() const
        int64_t index_page_offset() const
        int64_t total_compressed_size() const
        int64_t total_uncompressed_size() const
        unique_ptr[CColumnCryptoMetaData] crypto_metadata() const

    cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData":
        c_bool Equals(const CRowGroupMetaData&) const
        int num_columns()
        int64_t num_rows()
        int64_t total_byte_size()
        unique_ptr[CColumnChunkMetaData] ColumnChunk(int i) const

    cdef cppclass CFileMetaData" parquet::FileMetaData":
        c_bool Equals(const CFileMetaData&) const
        uint32_t size()
        int num_columns()
        int64_t num_rows()
        int num_row_groups()
        ParquetVersion version()
        const c_string created_by()
        int num_schema_elements()

        void set_file_path(const c_string& path)
        void AppendRowGroups(const CFileMetaData& other) except +

        unique_ptr[CRowGroupMetaData] RowGroup(int i)
        const SchemaDescriptor* schema()
        shared_ptr[const CKeyValueMetadata] key_value_metadata() const
        void WriteTo(COutputStream* dst) const

        inline c_bool is_encryption_algorithm_set() const
        inline EncryptionAlgorithm encryption_algorithm() const
        inline const c_string& footer_signing_key_metadata() const

    cdef shared_ptr[CFileMetaData] CFileMetaData_Make \
        " parquet::FileMetaData::Make"(const void* serialized_metadata,
                                       uint32_t* metadata_len)

    cdef cppclass CReaderProperties" parquet::ReaderProperties":
        c_bool is_buffered_stream_enabled() const
        void enable_buffered_stream()
        void disable_buffered_stream()

        void set_buffer_size(int64_t buf_size)
        int64_t buffer_size() const

        void set_thrift_string_size_limit(int32_t size)
        int32_t thrift_string_size_limit() const

        void set_thrift_container_size_limit(int32_t size)
        int32_t thrift_container_size_limit() const

        void file_decryption_properties(shared_ptr[CFileDecryptionProperties]
                                        decryption)
        shared_ptr[CFileDecryptionProperties] file_decryption_properties() \
            const

    CReaderProperties default_reader_properties()

    cdef cppclass ArrowReaderProperties:
        ArrowReaderProperties()
        void set_read_dictionary(int column_index, c_bool read_dict)
        c_bool read_dictionary()
        void set_batch_size(int64_t batch_size)
        int64_t batch_size()
        void set_pre_buffer(c_bool pre_buffer)
        c_bool pre_buffer() const
        void set_coerce_int96_timestamp_unit(TimeUnit unit)
        TimeUnit coerce_int96_timestamp_unit() const

    ArrowReaderProperties default_arrow_reader_properties()

    cdef cppclass ParquetFileReader:
        shared_ptr[CFileMetaData] metadata()

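The metadata and statistics classes declared above surface through the public pyarrow.parquet API. A minimal usage sketch (the file name is a placeholder):

import pyarrow.parquet as pq

md = pq.ParquetFile("example.parquet").metadata   # FileMetaData
print(md.num_rows, md.num_row_groups, md.created_by)

rg = md.row_group(0)                              # RowGroupMetaData
col = rg.column(0)                                # ColumnChunkMetaData
if col.is_stats_set:
    stats = col.statistics                        # Statistics
    print(stats.min, stats.max, stats.null_count)
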
cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
|
||||
cdef cppclass WriterProperties:
|
||||
cppclass Builder:
|
||||
Builder* data_page_version(ParquetDataPageVersion version)
|
||||
Builder* version(ParquetVersion version)
|
||||
Builder* compression(ParquetCompression codec)
|
||||
Builder* compression(const c_string& path,
|
||||
ParquetCompression codec)
|
||||
Builder* compression_level(int compression_level)
|
||||
Builder* compression_level(const c_string& path,
|
||||
int compression_level)
|
||||
Builder* encryption(
|
||||
shared_ptr[CFileEncryptionProperties]
|
||||
file_encryption_properties)
|
||||
Builder* disable_dictionary()
|
||||
Builder* enable_dictionary()
|
||||
Builder* enable_dictionary(const c_string& path)
|
||||
Builder* disable_statistics()
|
||||
Builder* enable_statistics()
|
||||
Builder* enable_statistics(const c_string& path)
|
||||
Builder* data_pagesize(int64_t size)
|
||||
Builder* encoding(ParquetEncoding encoding)
|
||||
Builder* encoding(const c_string& path,
|
||||
ParquetEncoding encoding)
|
||||
Builder* write_batch_size(int64_t batch_size)
|
||||
Builder* dictionary_pagesize_limit(int64_t dictionary_pagesize_limit)
|
||||
shared_ptr[WriterProperties] build()
|
||||
|
||||
cdef cppclass ArrowWriterProperties:
|
||||
cppclass Builder:
|
||||
Builder()
|
||||
Builder* disable_deprecated_int96_timestamps()
|
||||
Builder* enable_deprecated_int96_timestamps()
|
||||
Builder* coerce_timestamps(TimeUnit unit)
|
||||
Builder* allow_truncated_timestamps()
|
||||
Builder* disallow_truncated_timestamps()
|
||||
Builder* store_schema()
|
||||
Builder* enable_compliant_nested_types()
|
||||
Builder* disable_compliant_nested_types()
|
||||
Builder* set_engine_version(ArrowWriterEngineVersion version)
|
||||
shared_ptr[ArrowWriterProperties] build()
|
||||
c_bool support_deprecated_int96_timestamps()
|
||||
|
||||
|
||||
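These Builder chains back the keyword arguments of pyarrow.parquet.write_table. A hedged sketch of the mapping (the table and file name are illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"x": [1, 2, 3]})
pq.write_table(
    table, "example.parquet",
    compression="zstd",        # WriterProperties::Builder::compression
    compression_level=5,       # ::compression_level
    version="2.6",             # ::version (file format version)
    data_page_version="2.0",   # ::data_page_version
    use_dictionary=True,       # ::enable_dictionary
    write_statistics=True)     # ::enable_statistics
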
cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil:
|
||||
cdef cppclass FileReader:
|
||||
FileReader(CMemoryPool* pool, unique_ptr[ParquetFileReader] reader)
|
||||
|
||||
CStatus GetSchema(shared_ptr[CSchema]* out)
|
||||
|
||||
CStatus ReadColumn(int i, shared_ptr[CChunkedArray]* out)
|
||||
CStatus ReadSchemaField(int i, shared_ptr[CChunkedArray]* out)
|
||||
|
||||
int num_row_groups()
|
||||
CStatus ReadRowGroup(int i, shared_ptr[CTable]* out)
|
||||
CStatus ReadRowGroup(int i, const vector[int]& column_indices,
|
||||
shared_ptr[CTable]* out)
|
||||
|
||||
CStatus ReadRowGroups(const vector[int]& row_groups,
|
||||
shared_ptr[CTable]* out)
|
||||
CStatus ReadRowGroups(const vector[int]& row_groups,
|
||||
const vector[int]& column_indices,
|
||||
shared_ptr[CTable]* out)
|
||||
|
||||
CStatus GetRecordBatchReader(const vector[int]& row_group_indices,
|
||||
const vector[int]& column_indices,
|
||||
unique_ptr[CRecordBatchReader]* out)
|
||||
CStatus GetRecordBatchReader(const vector[int]& row_group_indices,
|
||||
unique_ptr[CRecordBatchReader]* out)
|
||||
|
||||
CStatus ReadTable(shared_ptr[CTable]* out)
|
||||
CStatus ReadTable(const vector[int]& column_indices,
|
||||
shared_ptr[CTable]* out)
|
||||
|
||||
CStatus ScanContents(vector[int] columns, int32_t column_batch_size,
|
||||
int64_t* num_rows)
|
||||
|
||||
const ParquetFileReader* parquet_reader()
|
||||
|
||||
void set_use_threads(c_bool use_threads)
|
||||
|
||||
void set_batch_size(int64_t batch_size)
|
||||
|
||||
cdef cppclass FileReaderBuilder:
|
||||
FileReaderBuilder()
|
||||
CStatus Open(const shared_ptr[CRandomAccessFile]& file,
|
||||
const CReaderProperties& properties,
|
||||
const shared_ptr[CFileMetaData]& metadata)
|
||||
|
||||
ParquetFileReader* raw_reader()
|
||||
FileReaderBuilder* memory_pool(CMemoryPool*)
|
||||
FileReaderBuilder* properties(const ArrowReaderProperties&)
|
||||
CStatus Build(unique_ptr[FileReader]* out)
|
||||
|
||||
CStatus FromParquetSchema(
|
||||
const SchemaDescriptor* parquet_schema,
|
||||
const ArrowReaderProperties& properties,
|
||||
const shared_ptr[const CKeyValueMetadata]& key_value_metadata,
|
||||
shared_ptr[CSchema]* out)
|
||||
|
||||
CStatus StatisticsAsScalars(const CStatistics& Statistics,
|
||||
shared_ptr[CScalar]* min,
|
||||
shared_ptr[CScalar]* max)
|
||||
|
||||
cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil:
|
||||
|
||||
CStatus ToParquetSchema(
|
||||
const CSchema* arrow_schema,
|
||||
const ArrowReaderProperties& properties,
|
||||
const shared_ptr[const CKeyValueMetadata]& key_value_metadata,
|
||||
shared_ptr[SchemaDescriptor]* out)
|
||||
|
||||
|
||||
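FileReader::ReadRowGroup(s) and GetRecordBatchReader are reachable from Python through pyarrow.parquet.ParquetFile; a small sketch:

import pyarrow.parquet as pq

pf = pq.ParquetFile("example.parquet")
first = pf.read_row_group(0)                        # -> pyarrow.Table
subset = pf.read_row_groups([0, 1], columns=["x"])
for batch in pf.iter_batches(batch_size=65536, columns=["x"]):
    ...  # stream RecordBatches without materializing the whole file
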
cdef extern from "parquet/properties.h" namespace "parquet" nogil:
|
||||
cdef enum ArrowWriterEngineVersion:
|
||||
V1 "parquet::ArrowWriterProperties::V1",
|
||||
V2 "parquet::ArrowWriterProperties::V2"
|
||||
|
||||
cdef cppclass ParquetDataPageVersion:
|
||||
pass
|
||||
|
||||
cdef ParquetDataPageVersion ParquetDataPageVersion_V1 \
|
||||
" parquet::ParquetDataPageVersion::V1"
|
||||
cdef ParquetDataPageVersion ParquetDataPageVersion_V2 \
|
||||
" parquet::ParquetDataPageVersion::V2"
|
||||
|
||||
cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil:
|
||||
cdef cppclass FileWriter:
|
||||
|
||||
@staticmethod
|
||||
CResult[unique_ptr[FileWriter]] Open(const CSchema& schema, CMemoryPool* pool,
|
||||
const shared_ptr[COutputStream]& sink,
|
||||
const shared_ptr[WriterProperties]& properties,
|
||||
const shared_ptr[ArrowWriterProperties]& arrow_properties)
|
||||
|
||||
CStatus WriteTable(const CTable& table, int64_t chunk_size)
|
||||
CStatus NewRowGroup(int64_t chunk_size)
|
||||
CStatus Close()
|
||||
|
||||
const shared_ptr[CFileMetaData] metadata() const
|
||||
|
||||
CStatus WriteMetaDataFile(
|
||||
const CFileMetaData& file_metadata,
|
||||
const COutputStream* sink)
|
||||
|
||||
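WriteMetaDataFile is what pyarrow.parquet.write_metadata builds on to emit a footer-only "_metadata" sidecar; a hedged sketch:

import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([("x", pa.int64())])
pq.write_metadata(schema, "_metadata")  # writes a data-less Parquet footer
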
cdef class FileEncryptionProperties:
    """File-level encryption properties for the low-level API"""
    cdef:
        shared_ptr[CFileEncryptionProperties] properties

    @staticmethod
    cdef inline FileEncryptionProperties wrap(
            shared_ptr[CFileEncryptionProperties] properties):

        result = FileEncryptionProperties()
        result.properties = properties
        return result

    cdef inline shared_ptr[CFileEncryptionProperties] unwrap(self):
        return self.properties


cdef shared_ptr[WriterProperties] _create_writer_properties(
    use_dictionary=*,
    compression=*,
    version=*,
    write_statistics=*,
    data_page_size=*,
    compression_level=*,
    use_byte_stream_split=*,
    column_encoding=*,
    data_page_version=*,
    FileEncryptionProperties encryption_properties=*,
    write_batch_size=*,
    dictionary_pagesize_limit=*) except *


cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
    use_deprecated_int96_timestamps=*,
    coerce_timestamps=*,
    allow_truncated_timestamps=*,
    writer_engine_version=*,
    use_compliant_nested_type=*,
    store_schema=*) except *


cdef class ParquetSchema(_Weakrefable):
    cdef:
        FileMetaData parent  # the FileMetaData owning the SchemaDescriptor
        const SchemaDescriptor* schema


cdef class FileMetaData(_Weakrefable):
    cdef:
        shared_ptr[CFileMetaData] sp_metadata
        CFileMetaData* _metadata
        ParquetSchema _schema

    cdef inline init(self, const shared_ptr[CFileMetaData]& metadata):
        self.sp_metadata = metadata
        self._metadata = metadata.get()


cdef class RowGroupMetaData(_Weakrefable):
    cdef:
        int index  # for pickling support
        unique_ptr[CRowGroupMetaData] up_metadata
        CRowGroupMetaData* metadata
        FileMetaData parent


cdef class ColumnChunkMetaData(_Weakrefable):
    cdef:
        unique_ptr[CColumnChunkMetaData] up_metadata
        CColumnChunkMetaData* metadata
        RowGroupMetaData parent

    cdef inline init(self, RowGroupMetaData parent, int i):
        self.up_metadata = parent.metadata.ColumnChunk(i)
        self.metadata = self.up_metadata.get()
        self.parent = parent


cdef class Statistics(_Weakrefable):
    cdef:
        shared_ptr[CStatistics] statistics
        ColumnChunkMetaData parent

    cdef inline init(self, const shared_ptr[CStatistics]& statistics,
                     ColumnChunkMetaData parent):
        self.statistics = statistics
        self.parent = parent


cdef extern from "parquet/encryption/encryption.h" namespace "parquet" nogil:
    cdef cppclass CFileDecryptionProperties\
            " parquet::FileDecryptionProperties":
        pass

    cdef cppclass CFileEncryptionProperties\
            " parquet::FileEncryptionProperties":
        pass


cdef class FileDecryptionProperties:
    """File-level decryption properties for the low-level API"""
    cdef:
        shared_ptr[CFileDecryptionProperties] properties

    @staticmethod
    cdef inline FileDecryptionProperties wrap(
            shared_ptr[CFileDecryptionProperties] properties):

        result = FileDecryptionProperties()
        result.properties = properties
        return result

    cdef inline shared_ptr[CFileDecryptionProperties] unwrap(self):
        return self.properties
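FileDecryptionProperties instances are produced by the high-level CryptoFactory (defined in _parquet_encryption.pyx below) and handed to the reader. A sketch, assuming crypto_factory and kms_config are set up as in the examples further down:

import pyarrow.parquet as pq

decryption_properties = crypto_factory.file_decryption_properties(kms_config)
pf = pq.ParquetFile("example.parquet",
                    decryption_properties=decryption_properties)
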
1791
venv/lib/python3.9/site-packages/pyarrow/_parquet.pyx
Normal file
File diff suppressed because it is too large
Binary file not shown.
133
venv/lib/python3.9/site-packages/pyarrow/_parquet_encryption.pxd
Normal file
@@ -0,0 +1,133 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# distutils: language = c++
# cython: language_level = 3

from pyarrow.includes.common cimport *
from pyarrow._parquet cimport (ParquetCipher,
                               CFileEncryptionProperties,
                               CFileDecryptionProperties,
                               FileEncryptionProperties,
                               FileDecryptionProperties,
                               ParquetCipher_AES_GCM_V1,
                               ParquetCipher_AES_GCM_CTR_V1)


cdef extern from "parquet/encryption/kms_client.h" \
        namespace "parquet::encryption" nogil:
    cdef cppclass CKmsClient" parquet::encryption::KmsClient":
        c_string WrapKey(const c_string& key_bytes,
                         const c_string& master_key_identifier) except +
        c_string UnwrapKey(const c_string& wrapped_key,
                           const c_string& master_key_identifier) except +

    cdef cppclass CKeyAccessToken" parquet::encryption::KeyAccessToken":
        CKeyAccessToken(const c_string value)
        void Refresh(const c_string& new_value)
        const c_string& value() const

    cdef cppclass CKmsConnectionConfig \
            " parquet::encryption::KmsConnectionConfig":
        CKmsConnectionConfig()
        c_string kms_instance_id
        c_string kms_instance_url
        shared_ptr[CKeyAccessToken] refreshable_key_access_token
        unordered_map[c_string, c_string] custom_kms_conf

# Callbacks for implementing Python kms clients
# Use typedef to emulate syntax for std::function<void(..)>
ctypedef void CallbackWrapKey(
    object, const c_string&, const c_string&, c_string*)
ctypedef void CallbackUnwrapKey(
    object, const c_string&, const c_string&, c_string*)

cdef extern from "parquet/encryption/kms_client_factory.h" \
|
||||
namespace "parquet::encryption" nogil:
|
||||
cdef cppclass CKmsClientFactory" parquet::encryption::KmsClientFactory":
|
||||
shared_ptr[CKmsClient] CreateKmsClient(
|
||||
const CKmsConnectionConfig& kms_connection_config) except +
|
||||
|
||||
# Callbacks for implementing Python kms client factories
|
||||
# Use typedef to emulate syntax for std::function<void(..)>
|
||||
ctypedef void CallbackCreateKmsClient(
|
||||
object,
|
||||
const CKmsConnectionConfig&, shared_ptr[CKmsClient]*)
|
||||
|
||||
cdef extern from "parquet/encryption/crypto_factory.h" \
|
||||
namespace "parquet::encryption" nogil:
|
||||
cdef cppclass CEncryptionConfiguration\
|
||||
" parquet::encryption::EncryptionConfiguration":
|
||||
CEncryptionConfiguration(const c_string& footer_key) except +
|
||||
c_string footer_key
|
||||
c_string column_keys
|
||||
ParquetCipher encryption_algorithm
|
||||
c_bool plaintext_footer
|
||||
c_bool double_wrapping
|
||||
double cache_lifetime_seconds
|
||||
c_bool internal_key_material
|
||||
int32_t data_key_length_bits
|
||||
|
||||
cdef cppclass CDecryptionConfiguration\
|
||||
" parquet::encryption::DecryptionConfiguration":
|
||||
CDecryptionConfiguration() except +
|
||||
double cache_lifetime_seconds
|
||||
|
||||
cdef cppclass CCryptoFactory" parquet::encryption::CryptoFactory":
|
||||
void RegisterKmsClientFactory(
|
||||
shared_ptr[CKmsClientFactory] kms_client_factory) except +
|
||||
shared_ptr[CFileEncryptionProperties] GetFileEncryptionProperties(
|
||||
const CKmsConnectionConfig& kms_connection_config,
|
||||
const CEncryptionConfiguration& encryption_config) except +*
|
||||
shared_ptr[CFileDecryptionProperties] GetFileDecryptionProperties(
|
||||
const CKmsConnectionConfig& kms_connection_config,
|
||||
const CDecryptionConfiguration& decryption_config) except +*
|
||||
void RemoveCacheEntriesForToken(const c_string& access_token) except +
|
||||
void RemoveCacheEntriesForAllTokens() except +
|
||||
|
||||
cdef extern from "arrow/python/parquet_encryption.h" \
|
||||
namespace "arrow::py::parquet::encryption" nogil:
|
||||
cdef cppclass CPyKmsClientVtable \
|
||||
" arrow::py::parquet::encryption::PyKmsClientVtable":
|
||||
CPyKmsClientVtable()
|
||||
function[CallbackWrapKey] wrap_key
|
||||
function[CallbackUnwrapKey] unwrap_key
|
||||
|
||||
cdef cppclass CPyKmsClient\
|
||||
" arrow::py::parquet::encryption::PyKmsClient"(CKmsClient):
|
||||
CPyKmsClient(object handler, CPyKmsClientVtable vtable)
|
||||
|
||||
cdef cppclass CPyKmsClientFactoryVtable\
|
||||
" arrow::py::parquet::encryption::PyKmsClientFactoryVtable":
|
||||
CPyKmsClientFactoryVtable()
|
||||
function[CallbackCreateKmsClient] create_kms_client
|
||||
|
||||
cdef cppclass CPyKmsClientFactory\
|
||||
" arrow::py::parquet::encryption::PyKmsClientFactory"(
|
||||
CKmsClientFactory):
|
||||
CPyKmsClientFactory(object handler, CPyKmsClientFactoryVtable vtable)
|
||||
|
||||
cdef cppclass CPyCryptoFactory\
|
||||
" arrow::py::parquet::encryption::PyCryptoFactory"(CCryptoFactory):
|
||||
CResult[shared_ptr[CFileEncryptionProperties]] \
|
||||
SafeGetFileEncryptionProperties(
|
||||
const CKmsConnectionConfig& kms_connection_config,
|
||||
const CEncryptionConfiguration& encryption_config)
|
||||
CResult[shared_ptr[CFileDecryptionProperties]] \
|
||||
SafeGetFileDecryptionProperties(
|
||||
const CKmsConnectionConfig& kms_connection_config,
|
||||
const CDecryptionConfiguration& decryption_config)
|
||||
475
venv/lib/python3.9/site-packages/pyarrow/_parquet_encryption.pyx
Normal file
@@ -0,0 +1,475 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: profile=False
# distutils: language = c++

from datetime import timedelta
import io
import warnings

from libcpp cimport nullptr

from cython.operator cimport dereference as deref
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport _Weakrefable

from pyarrow.lib import (ArrowException,
                         tobytes, frombytes)

cimport cpython as cp


cdef ParquetCipher cipher_from_name(name):
    name = name.upper()
    if name == 'AES_GCM_V1':
        return ParquetCipher_AES_GCM_V1
    elif name == 'AES_GCM_CTR_V1':
        return ParquetCipher_AES_GCM_CTR_V1
    else:
        raise ValueError(f'Invalid cipher name: {name!r}')


cdef cipher_to_name(ParquetCipher cipher):
    if ParquetCipher_AES_GCM_V1 == cipher:
        return 'AES_GCM_V1'
    elif ParquetCipher_AES_GCM_CTR_V1 == cipher:
        return 'AES_GCM_CTR_V1'
    else:
        raise ValueError('Invalid cipher value: {0}'.format(cipher))

cdef class EncryptionConfiguration(_Weakrefable):
    """Configuration of the encryption, such as which columns to encrypt"""
    cdef:
        shared_ptr[CEncryptionConfiguration] configuration

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, footer_key, *, column_keys=None,
                 encryption_algorithm=None,
                 plaintext_footer=None, double_wrapping=None,
                 cache_lifetime=None, internal_key_material=None,
                 data_key_length_bits=None):
        self.configuration.reset(
            new CEncryptionConfiguration(tobytes(footer_key)))
        if column_keys is not None:
            self.column_keys = column_keys
        if encryption_algorithm is not None:
            self.encryption_algorithm = encryption_algorithm
        if plaintext_footer is not None:
            self.plaintext_footer = plaintext_footer
        if double_wrapping is not None:
            self.double_wrapping = double_wrapping
        if cache_lifetime is not None:
            self.cache_lifetime = cache_lifetime
        if internal_key_material is not None:
            self.internal_key_material = internal_key_material
        if data_key_length_bits is not None:
            self.data_key_length_bits = data_key_length_bits

    @property
    def footer_key(self):
        """ID of the master key for footer encryption/signing"""
        return frombytes(self.configuration.get().footer_key)

    @property
    def column_keys(self):
        """
        List of columns to encrypt, with master key IDs.
        """
        column_keys_str = frombytes(self.configuration.get().column_keys)
        # Convert from "masterKeyID:colName,colName;masterKeyID:colName..."
        # (see HIVE-21848) to a dictionary of master key ID to column name lists
        column_keys_to_key_list_str = dict(subString.replace(" ", "").split(
            ":") for subString in column_keys_str.split(";"))
        column_keys_dict = {k: v.split(
            ",") for k, v in column_keys_to_key_list_str.items()}
        return column_keys_dict

    @column_keys.setter
    def column_keys(self, dict value):
        if value is not None:
            # convert a dictionary such as
            # '{"key1": ["col1 ", "col2"], "key2": ["col3 ", "col4"]}'
            # to the string defined by the spec
            # 'key1: col1 , col2; key2: col3 , col4'
            column_keys = "; ".join(
                ["{}: {}".format(k, ", ".join(v)) for k, v in value.items()])
            self.configuration.get().column_keys = tobytes(column_keys)

    @property
    def encryption_algorithm(self):
        """Parquet encryption algorithm.
        Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1"."""
        return cipher_to_name(self.configuration.get().encryption_algorithm)

    @encryption_algorithm.setter
    def encryption_algorithm(self, value):
        cipher = cipher_from_name(value)
        self.configuration.get().encryption_algorithm = cipher

    @property
    def plaintext_footer(self):
        """Write files with plaintext footer."""
        return self.configuration.get().plaintext_footer

    @plaintext_footer.setter
    def plaintext_footer(self, value):
        self.configuration.get().plaintext_footer = value

    @property
    def double_wrapping(self):
        """Use double wrapping - where data encryption keys (DEKs) are
        encrypted with key encryption keys (KEKs), which in turn are
        encrypted with master keys.
        If set to false, use single wrapping - where DEKs are
        encrypted directly with master keys."""
        return self.configuration.get().double_wrapping

    @double_wrapping.setter
    def double_wrapping(self, value):
        self.configuration.get().double_wrapping = value

    @property
    def cache_lifetime(self):
        """Lifetime of cached entities (key encryption keys,
        local wrapping keys, KMS client objects)."""
        return timedelta(
            seconds=self.configuration.get().cache_lifetime_seconds)

    @cache_lifetime.setter
    def cache_lifetime(self, value):
        if not isinstance(value, timedelta):
            raise TypeError("cache_lifetime should be a timedelta")
        self.configuration.get().cache_lifetime_seconds = value.total_seconds()

    @property
    def internal_key_material(self):
        """Store key material inside Parquet file footers; this mode doesn't
        produce additional files. If set to false, key material is stored in
        separate files in the same folder, which enables key rotation for
        immutable Parquet files."""
        return self.configuration.get().internal_key_material

    @internal_key_material.setter
    def internal_key_material(self, value):
        self.configuration.get().internal_key_material = value

    @property
    def data_key_length_bits(self):
        """Length of data encryption keys (DEKs), randomly generated by parquet key
        management tools. Can be 128, 192 or 256 bits."""
        return self.configuration.get().data_key_length_bits

    @data_key_length_bits.setter
    def data_key_length_bits(self, value):
        self.configuration.get().data_key_length_bits = value

    cdef inline shared_ptr[CEncryptionConfiguration] unwrap(self) nogil:
        return self.configuration

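A sketch of building an EncryptionConfiguration through the public pyarrow.parquet.encryption module; the key IDs and column names are placeholders:

from datetime import timedelta
import pyarrow.parquet.encryption as pe

encryption_config = pe.EncryptionConfiguration(
    footer_key="footer_key_id",
    column_keys={"col_key_id": ["secret_col_a", "secret_col_b"]},
    encryption_algorithm="AES_GCM_V1",
    cache_lifetime=timedelta(minutes=5),
    double_wrapping=True)
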
cdef class DecryptionConfiguration(_Weakrefable):
    """Configuration of the decryption, such as cache timeout."""
    cdef:
        shared_ptr[CDecryptionConfiguration] configuration

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, *, cache_lifetime=None):
        self.configuration.reset(new CDecryptionConfiguration())
        # Apply the parameter instead of silently ignoring it
        if cache_lifetime is not None:
            self.cache_lifetime = cache_lifetime

    @property
    def cache_lifetime(self):
        """Lifetime of cached entities (key encryption keys,
        local wrapping keys, KMS client objects)."""
        return timedelta(
            seconds=self.configuration.get().cache_lifetime_seconds)

    @cache_lifetime.setter
    def cache_lifetime(self, value):
        self.configuration.get().cache_lifetime_seconds = value.total_seconds()

    cdef inline shared_ptr[CDecryptionConfiguration] unwrap(self) nogil:
        return self.configuration

cdef class KmsConnectionConfig(_Weakrefable):
    """Configuration of the connection to the Key Management Service (KMS)"""
    cdef:
        shared_ptr[CKmsConnectionConfig] configuration

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, *, kms_instance_id=None, kms_instance_url=None,
                 key_access_token=None, custom_kms_conf=None):
        self.configuration.reset(new CKmsConnectionConfig())
        if kms_instance_id is not None:
            self.kms_instance_id = kms_instance_id
        if kms_instance_url is not None:
            self.kms_instance_url = kms_instance_url
        if key_access_token is None:
            self.key_access_token = b'DEFAULT'
        else:
            self.key_access_token = key_access_token
        if custom_kms_conf is not None:
            self.custom_kms_conf = custom_kms_conf

    @property
    def kms_instance_id(self):
        """ID of the KMS instance that will be used for encryption
        (if multiple KMS instances are available)."""
        return frombytes(self.configuration.get().kms_instance_id)

    @kms_instance_id.setter
    def kms_instance_id(self, value):
        self.configuration.get().kms_instance_id = tobytes(value)

    @property
    def kms_instance_url(self):
        """URL of the KMS instance."""
        return frombytes(self.configuration.get().kms_instance_url)

    @kms_instance_url.setter
    def kms_instance_url(self, value):
        self.configuration.get().kms_instance_url = tobytes(value)

    @property
    def key_access_token(self):
        """Authorization token that will be passed to KMS."""
        return frombytes(self.configuration.get()
                         .refreshable_key_access_token.get().value())

    @key_access_token.setter
    def key_access_token(self, value):
        self.refresh_key_access_token(value)

    @property
    def custom_kms_conf(self):
        """A dictionary with KMS-type-specific configuration"""
        custom_kms_conf = {
            frombytes(k): frombytes(v)
            for k, v in self.configuration.get().custom_kms_conf
        }
        return custom_kms_conf

    @custom_kms_conf.setter
    def custom_kms_conf(self, dict value):
        if value is not None:
            for k, v in value.items():
                if isinstance(k, str) and isinstance(v, str):
                    self.configuration.get().custom_kms_conf[tobytes(k)] = \
                        tobytes(v)
                else:
                    raise TypeError("Expected custom_kms_conf to be " +
                                    "a dictionary of strings")

    def refresh_key_access_token(self, value):
        cdef:
            shared_ptr[CKeyAccessToken] c_key_access_token = \
                self.configuration.get().refreshable_key_access_token

        c_key_access_token.get().Refresh(tobytes(value))

    cdef inline shared_ptr[CKmsConnectionConfig] unwrap(self) nogil:
        return self.configuration

    @staticmethod
    cdef wrap(const CKmsConnectionConfig& config):
        result = KmsConnectionConfig()
        result.configuration = make_shared[CKmsConnectionConfig](move(config))
        return result

# Callback definitions for CPyKmsClientVtable
cdef void _cb_wrap_key(
        handler, const c_string& key_bytes,
        const c_string& master_key_identifier, c_string* out) except *:
    mkid_str = frombytes(master_key_identifier)
    wrapped_key = handler.wrap_key(key_bytes, mkid_str)
    out[0] = tobytes(wrapped_key)


cdef void _cb_unwrap_key(
        handler, const c_string& wrapped_key,
        const c_string& master_key_identifier, c_string* out) except *:
    mkid_str = frombytes(master_key_identifier)
    wk_str = frombytes(wrapped_key)
    key = handler.unwrap_key(wk_str, mkid_str)
    out[0] = tobytes(key)

cdef class KmsClient(_Weakrefable):
    """The abstract base class for KmsClient implementations."""
    cdef:
        shared_ptr[CKmsClient] client

    def __init__(self):
        self.init()

    cdef init(self):
        cdef:
            CPyKmsClientVtable vtable = CPyKmsClientVtable()

        vtable.wrap_key = _cb_wrap_key
        vtable.unwrap_key = _cb_unwrap_key

        self.client.reset(new CPyKmsClient(self, vtable))

    def wrap_key(self, key_bytes, master_key_identifier):
        """Wrap a key - encrypt it with the master key."""
        raise NotImplementedError()

    def unwrap_key(self, wrapped_key, master_key_identifier):
        """Unwrap a key - decrypt it with the master key."""
        raise NotImplementedError()

    cdef inline shared_ptr[CKmsClient] unwrap(self) nogil:
        return self.client

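The vtable above forwards WrapKey/UnwrapKey to a Python subclass of KmsClient. A toy, deliberately insecure sketch that "wraps" keys with base64 instead of calling a real KMS:

import base64
import pyarrow.parquet.encryption as pe

class InMemoryKmsClient(pe.KmsClient):
    """Demo only: not a real key management service."""

    def __init__(self, kms_connection_config):
        super().__init__()

    def wrap_key(self, key_bytes, master_key_identifier):
        # A real client would send key_bytes to the KMS for wrapping
        return base64.b64encode(key_bytes)

    def unwrap_key(self, wrapped_key, master_key_identifier):
        return base64.b64decode(wrapped_key)
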
# Callback definition for CPyKmsClientFactoryVtable
cdef void _cb_create_kms_client(
        handler,
        const CKmsConnectionConfig& kms_connection_config,
        shared_ptr[CKmsClient]* out) except *:
    connection_config = KmsConnectionConfig.wrap(kms_connection_config)

    result = handler(connection_config)
    if not isinstance(result, KmsClient):
        raise TypeError(
            "callable must return KmsClient instances, but got {}".format(
                type(result)))

    out[0] = (<KmsClient> result).unwrap()

cdef class CryptoFactory(_Weakrefable):
    """A factory that produces the low-level FileEncryptionProperties and
    FileDecryptionProperties objects, from the high-level parameters."""
    cdef:
        unique_ptr[CPyCryptoFactory] factory

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, kms_client_factory):
        """Create CryptoFactory.

        Parameters
        ----------
        kms_client_factory : a callable that accepts KmsConnectionConfig
            and returns a KmsClient
        """
        self.factory.reset(new CPyCryptoFactory())

        if callable(kms_client_factory):
            self.init(kms_client_factory)
        else:
            raise TypeError("Parameter kms_client_factory must be a callable")

    cdef init(self, callable_client_factory):
        cdef:
            CPyKmsClientFactoryVtable vtable
            shared_ptr[CPyKmsClientFactory] kms_client_factory

        vtable.create_kms_client = _cb_create_kms_client
        kms_client_factory.reset(
            new CPyKmsClientFactory(callable_client_factory, vtable))
        # A KmsClientFactory object must be registered
        # via this method before calling any of
        # file_encryption_properties()/file_decryption_properties() methods.
        self.factory.get().RegisterKmsClientFactory(
            static_pointer_cast[CKmsClientFactory, CPyKmsClientFactory](
                kms_client_factory))

    def file_encryption_properties(self,
                                   KmsConnectionConfig kms_connection_config,
                                   EncryptionConfiguration encryption_config):
        """Create file encryption properties.

        Parameters
        ----------
        kms_connection_config : KmsConnectionConfig
            Configuration of connection to KMS

        encryption_config : EncryptionConfiguration
            Configuration of the encryption, such as which columns to encrypt

        Returns
        -------
        file_encryption_properties : FileEncryptionProperties
            File encryption properties.
        """
        cdef:
            CResult[shared_ptr[CFileEncryptionProperties]] \
                file_encryption_properties_result
        with nogil:
            file_encryption_properties_result = \
                self.factory.get().SafeGetFileEncryptionProperties(
                    deref(kms_connection_config.unwrap().get()),
                    deref(encryption_config.unwrap().get()))
        file_encryption_properties = GetResultValue(
            file_encryption_properties_result)
        return FileEncryptionProperties.wrap(file_encryption_properties)

    def file_decryption_properties(
            self,
            KmsConnectionConfig kms_connection_config,
            DecryptionConfiguration decryption_config=None):
        """Create file decryption properties.

        Parameters
        ----------
        kms_connection_config : KmsConnectionConfig
            Configuration of connection to KMS

        decryption_config : DecryptionConfiguration, default None
            Configuration of the decryption, such as cache timeout.
            Can be None.

        Returns
        -------
        file_decryption_properties : FileDecryptionProperties
            File decryption properties.
        """
        cdef:
            CDecryptionConfiguration c_decryption_config
            CResult[shared_ptr[CFileDecryptionProperties]] \
                c_file_decryption_properties
        if decryption_config is None:
            c_decryption_config = CDecryptionConfiguration()
        else:
            c_decryption_config = deref(decryption_config.unwrap().get())
        with nogil:
            c_file_decryption_properties = \
                self.factory.get().SafeGetFileDecryptionProperties(
                    deref(kms_connection_config.unwrap().get()),
                    c_decryption_config)
        file_decryption_properties = GetResultValue(
            c_file_decryption_properties)
        return FileDecryptionProperties.wrap(file_decryption_properties)

    def remove_cache_entries_for_token(self, access_token):
        self.factory.get().RemoveCacheEntriesForToken(tobytes(access_token))

    def remove_cache_entries_for_all_tokens(self):
        self.factory.get().RemoveCacheEntriesForAllTokens()
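An end-to-end sketch tying the pieces together: register a KMS client factory, derive encryption properties, and write an encrypted file. InMemoryKmsClient is the toy client sketched above, and encryption_config is the configuration built earlier; file names and token values are placeholders.

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.parquet.encryption as pe

kms_config = pe.KmsConnectionConfig(key_access_token="token_placeholder")
crypto_factory = pe.CryptoFactory(lambda config: InMemoryKmsClient(config))

encryption_properties = crypto_factory.file_encryption_properties(
    kms_config, encryption_config)

table = pa.table({"secret_col_a": [1, 2], "x": [3, 4]})
with pq.ParquetWriter("encrypted.parquet", table.schema,
                      encryption_properties=encryption_properties) as writer:
    writer.write_table(table)
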
BIN
venv/lib/python3.9/site-packages/pyarrow/_plasma.cpython-39-darwin.so
Executable file
Binary file not shown.
895
venv/lib/python3.9/site-packages/pyarrow/_plasma.pyx
Normal file
@@ -0,0 +1,895 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: profile=False
# distutils: language = c++
# cython: language_level = 3

from libcpp cimport bool as c_bool, nullptr
from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector as c_vector
from libcpp.unordered_map cimport unordered_map
from libc.stdint cimport int64_t, uint8_t, uintptr_t
from cython.operator cimport dereference as deref, preincrement as inc
from cpython.pycapsule cimport *

from collections.abc import Sequence
import random
import socket
import warnings

import pyarrow
from pyarrow.lib cimport (Buffer, NativeFile, _Weakrefable,
                          check_status, pyarrow_wrap_buffer)
from pyarrow.lib import ArrowException, frombytes
from pyarrow.includes.libarrow cimport (CBuffer, CMutableBuffer,
                                        CFixedSizeBufferWriter, CStatus)
from pyarrow.includes.libplasma cimport *

PLASMA_WAIT_TIMEOUT = 2 ** 30

cdef extern from "plasma/common.h" nogil:
|
||||
cdef cppclass CCudaIpcPlaceholder" plasma::internal::CudaIpcPlaceholder":
|
||||
pass
|
||||
|
||||
cdef cppclass CUniqueID" plasma::UniqueID":
|
||||
|
||||
@staticmethod
|
||||
CUniqueID from_binary(const c_string& binary)
|
||||
|
||||
@staticmethod
|
||||
CUniqueID from_random()
|
||||
|
||||
c_bool operator==(const CUniqueID& rhs) const
|
||||
|
||||
c_string hex() const
|
||||
|
||||
c_string binary() const
|
||||
|
||||
@staticmethod
|
||||
int64_t size()
|
||||
|
||||
cdef enum CObjectState" plasma::ObjectState":
|
||||
PLASMA_CREATED" plasma::ObjectState::PLASMA_CREATED"
|
||||
PLASMA_SEALED" plasma::ObjectState::PLASMA_SEALED"
|
||||
|
||||
cdef struct CObjectTableEntry" plasma::ObjectTableEntry":
|
||||
int fd
|
||||
int device_num
|
||||
int64_t map_size
|
||||
ptrdiff_t offset
|
||||
uint8_t* pointer
|
||||
int64_t data_size
|
||||
int64_t metadata_size
|
||||
int ref_count
|
||||
int64_t create_time
|
||||
int64_t construct_duration
|
||||
CObjectState state
|
||||
shared_ptr[CCudaIpcPlaceholder] ipc_handle
|
||||
|
||||
ctypedef unordered_map[CUniqueID, unique_ptr[CObjectTableEntry]] \
|
||||
CObjectTable" plasma::ObjectTable"
|
||||
|
||||
|
||||
cdef extern from "plasma/common.h":
|
||||
cdef int64_t kDigestSize" plasma::kDigestSize"
|
||||
|
||||
cdef extern from "plasma/client.h" nogil:
|
||||
|
||||
cdef cppclass CPlasmaClient" plasma::PlasmaClient":
|
||||
|
||||
CPlasmaClient()
|
||||
|
||||
CStatus Connect(const c_string& store_socket_name,
|
||||
const c_string& manager_socket_name,
|
||||
int release_delay, int num_retries)
|
||||
|
||||
CStatus Create(const CUniqueID& object_id,
|
||||
int64_t data_size, const uint8_t* metadata, int64_t
|
||||
metadata_size, const shared_ptr[CBuffer]* data)
|
||||
|
||||
CStatus CreateAndSeal(const CUniqueID& object_id,
|
||||
const c_string& data, const c_string& metadata)
|
||||
|
||||
CStatus Get(const c_vector[CUniqueID] object_ids, int64_t timeout_ms,
|
||||
c_vector[CObjectBuffer]* object_buffers)
|
||||
|
||||
CStatus Seal(const CUniqueID& object_id)
|
||||
|
||||
CStatus Evict(int64_t num_bytes, int64_t& num_bytes_evicted)
|
||||
|
||||
CStatus Hash(const CUniqueID& object_id, uint8_t* digest)
|
||||
|
||||
CStatus Release(const CUniqueID& object_id)
|
||||
|
||||
CStatus Contains(const CUniqueID& object_id, c_bool* has_object)
|
||||
|
||||
CStatus List(CObjectTable* objects)
|
||||
|
||||
CStatus Subscribe(int* fd)
|
||||
|
||||
CStatus DecodeNotifications(const uint8_t* buffer,
|
||||
c_vector[CUniqueID]* object_ids,
|
||||
c_vector[int64_t]* data_sizes,
|
||||
c_vector[int64_t]* metadata_sizes)
|
||||
|
||||
CStatus GetNotification(int fd, CUniqueID* object_id,
|
||||
int64_t* data_size, int64_t* metadata_size)
|
||||
|
||||
CStatus Disconnect()
|
||||
|
||||
CStatus Delete(const c_vector[CUniqueID] object_ids)
|
||||
|
||||
CStatus SetClientOptions(const c_string& client_name,
|
||||
int64_t limit_output_memory)
|
||||
|
||||
c_string DebugString()
|
||||
|
||||
int64_t store_capacity()
|
||||
|
||||
cdef extern from "plasma/client.h" nogil:
|
||||
|
||||
cdef struct CObjectBuffer" plasma::ObjectBuffer":
|
||||
shared_ptr[CBuffer] data
|
||||
shared_ptr[CBuffer] metadata
|
||||
|
||||
|
||||
def make_object_id(object_id):
    return ObjectID(object_id)


cdef class ObjectID(_Weakrefable):
    """
    DEPRECATED: An ObjectID represents a string of bytes used to identify Plasma objects.

    .. deprecated:: 10.0.0
       Plasma is deprecated since Arrow 10.0.0. It will be removed in 12.0.0 or so.
    """

    cdef:
        CUniqueID data

    def __cinit__(self, object_id):
        if (not isinstance(object_id, bytes) or
                len(object_id) != CUniqueID.size()):
            raise ValueError("Object ID must be 20 bytes,"
                             " is " + str(object_id))
        self.data = CUniqueID.from_binary(object_id)

        warnings.warn(
            "Plasma is deprecated since Arrow 10.0.0. It will be removed in 12.0.0 or so.",
            DeprecationWarning, stacklevel=2)

    def __eq__(self, other):
        try:
            return self.data == (<ObjectID?>other).data
        except TypeError:
            return False

    def __hash__(self):
        return hash(self.data.binary())

    def __repr__(self):
        return "ObjectID(" + self.data.hex().decode() + ")"

    def __reduce__(self):
        return (make_object_id, (self.data.binary(),))

    def binary(self):
        """
        Return the binary representation of this ObjectID.

        Returns
        -------
        bytes
            Binary representation of the ObjectID.
        """
        return self.data.binary()

    @staticmethod
    def from_random():
        """
        Returns a randomly generated ObjectID.

        Returns
        -------
        ObjectID
            A randomly generated ObjectID.
        """
        random_id = bytes(bytearray(
            random.getrandbits(8) for _ in range(CUniqueID.size())))
        return ObjectID(random_id)

cdef class ObjectNotAvailable(_Weakrefable):
    """
    Placeholder for an object that was not available within the given timeout.
    """
    pass


cdef class PlasmaBuffer(Buffer):
    """
    DEPRECATED: This is the type returned by calls to get with a PlasmaClient.

    We define our own class instead of directly returning a buffer object so
    that we can add a custom destructor which notifies Plasma that the object
    is no longer being used, so the memory in the Plasma store backing the
    object can potentially be freed.

    .. deprecated:: 10.0.0
       Plasma is deprecated since Arrow 10.0.0. It will be removed in 12.0.0 or so.

    Attributes
    ----------
    object_id : ObjectID
        The ID of the object in the buffer.
    client : PlasmaClient
        The PlasmaClient that we use to communicate with the store and manager.
    """

    cdef:
        ObjectID object_id
        PlasmaClient client

    @staticmethod
    cdef PlasmaBuffer create(ObjectID object_id, PlasmaClient client,
                             const shared_ptr[CBuffer]& buffer):
        cdef PlasmaBuffer self = PlasmaBuffer.__new__(PlasmaBuffer)
        self.object_id = object_id
        self.client = client
        self.init(buffer)
        return self

    def __init__(self):
        raise TypeError("Do not call PlasmaBuffer's constructor directly, use "
                        "`PlasmaClient.create` instead.")

    def __dealloc__(self):
        """
        Notify Plasma that the object is no longer needed.

        If the plasma client has been shut down, then don't do anything.
        """
        self.client._release(self.object_id)

class PlasmaObjectNotFound(ArrowException):
    pass


class PlasmaStoreFull(ArrowException):
    pass


class PlasmaObjectExists(ArrowException):
    pass


cdef int plasma_check_status(const CStatus& status) nogil except -1:
    if status.ok():
        return 0

    with gil:
        message = frombytes(status.message())
        if IsPlasmaObjectExists(status):
            raise PlasmaObjectExists(message)
        elif IsPlasmaObjectNotFound(status):
            raise PlasmaObjectNotFound(message)
        elif IsPlasmaStoreFull(status):
            raise PlasmaStoreFull(message)

    return check_status(status)


def get_socket_from_fd(fileno, family, type):
    import socket
    return socket.socket(fileno=fileno, family=family, type=type)

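A hedged sketch of the (deprecated) client in use, assuming a plasma store process is already listening on /tmp/plasma:

import pyarrow.plasma as plasma

client = plasma.connect("/tmp/plasma")
oid = client.put([1, 2, 3])       # returns an ObjectID
assert client.contains(oid)
value = client.get(oid)           # -> [1, 2, 3]
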
cdef class PlasmaClient(_Weakrefable):
    """
    DEPRECATED: The PlasmaClient is used to interface with a plasma store and manager.

    The PlasmaClient can ask the PlasmaStore to allocate a new buffer, seal a
    buffer, and get a buffer. Buffers are referred to by object IDs, which are
    strings.

    .. deprecated:: 10.0.0
       Plasma is deprecated since Arrow 10.0.0. It will be removed in 12.0.0 or so.
    """

    cdef:
        shared_ptr[CPlasmaClient] client
        int notification_fd
        c_string store_socket_name

    def __cinit__(self):
        self.client.reset(new CPlasmaClient())
        self.notification_fd = -1
        self.store_socket_name = b""

        warnings.warn(
            "Plasma is deprecated since Arrow 10.0.0. It will be removed in 12.0.0 or so.",
            DeprecationWarning, stacklevel=3)

    cdef _get_object_buffers(self, object_ids, int64_t timeout_ms,
                             c_vector[CObjectBuffer]* result):
        cdef:
            c_vector[CUniqueID] ids
            ObjectID object_id

        for object_id in object_ids:
            ids.push_back(object_id.data)
        with nogil:
            plasma_check_status(self.client.get().Get(ids, timeout_ms, result))

    # XXX C++ API should instead expose some kind of CreateAuto()
    cdef _make_mutable_plasma_buffer(self, ObjectID object_id, uint8_t* data,
                                     int64_t size):
        cdef shared_ptr[CBuffer] buffer
        buffer.reset(new CMutableBuffer(data, size))
        return PlasmaBuffer.create(object_id, self, buffer)

    @property
    def store_socket_name(self):
        return self.store_socket_name.decode()

    def create(self, ObjectID object_id, int64_t data_size,
               c_string metadata=b""):
        """
        Create a new buffer in the PlasmaStore for a particular object ID.

        The returned buffer is mutable until ``seal()`` is called.

        Parameters
        ----------
        object_id : ObjectID
            The object ID used to identify an object.
        data_size : int
            The size in bytes of the created buffer.
        metadata : bytes
            An optional string of bytes encoding whatever metadata the user
            wishes to encode.

        Returns
        -------
        buffer : Buffer
            A mutable buffer where to write the object data.

        Raises
        ------
        PlasmaObjectExists
            This exception is raised if the object could not be created because
            there already is an object with the same ID in the plasma store.

        PlasmaStoreFull
            This exception is raised if the object could
            not be created because the plasma store is unable to evict
            enough objects to create room for it.
        """
        cdef shared_ptr[CBuffer] data
        with nogil:
            plasma_check_status(
                self.client.get().Create(object_id.data, data_size,
                                         <uint8_t*>(metadata.data()),
                                         metadata.size(), &data))
        return self._make_mutable_plasma_buffer(object_id,
                                                data.get().mutable_data(),
                                                data_size)

    def create_and_seal(self, ObjectID object_id, c_string data,
                        c_string metadata=b""):
        """
        Store a new object in the PlasmaStore for a particular object ID.

        Parameters
        ----------
        object_id : ObjectID
            The object ID used to identify an object.
        data : bytes
            The object to store.
        metadata : bytes
            An optional string of bytes encoding whatever metadata the user
            wishes to encode.

        Raises
        ------
        PlasmaObjectExists
            This exception is raised if the object could not be created because
            there already is an object with the same ID in the plasma store.

        PlasmaStoreFull: This exception is raised if the object could
            not be created because the plasma store is unable to evict
            enough objects to create room for it.
        """
        with nogil:
            plasma_check_status(
                self.client.get().CreateAndSeal(object_id.data, data,
                                                metadata))

    def get_buffers(self, object_ids, timeout_ms=-1, with_meta=False):
        """
        Returns data buffer from the PlasmaStore based on object ID.

        If the object has not been sealed yet, this call will block. The
        retrieved buffer is immutable.

        Parameters
        ----------
        object_ids : list
            A list of ObjectIDs used to identify some objects.
        timeout_ms : int
            The number of milliseconds that the get call should block before
            timing out and returning. Pass -1 if the call should block and 0
            if the call should return immediately.
        with_meta : bool
            Whether to also return the metadata buffer with each object.

        Returns
        -------
        list
            If with_meta=False, this is a list of PlasmaBuffers for the data
            associated with the object_ids and None if the object was not
            available. If with_meta=True, this is a list of tuples of
            PlasmaBuffer and metadata bytes.
        """
        cdef c_vector[CObjectBuffer] object_buffers
        self._get_object_buffers(object_ids, timeout_ms, &object_buffers)
        result = []
        for i in range(object_buffers.size()):
            if object_buffers[i].data.get() != nullptr:
                data = pyarrow_wrap_buffer(object_buffers[i].data)
            else:
                data = None
            if not with_meta:
                result.append(data)
            else:
                if object_buffers[i].metadata.get() != nullptr:
                    size = object_buffers[i].metadata.get().size()
                    metadata = object_buffers[i].metadata.get().data()[:size]
                else:
                    metadata = None
                result.append((metadata, data))
        return result

    def get_metadata(self, object_ids, timeout_ms=-1):
        """
        Returns metadata buffer from the PlasmaStore based on object ID.

        If the object has not been sealed yet, this call will block. The
        retrieved buffer is immutable.

        Parameters
        ----------
        object_ids : list
            A list of ObjectIDs used to identify some objects.
        timeout_ms : int
            The number of milliseconds that the get call should block before
            timing out and returning. Pass -1 if the call should block and 0
            if the call should return immediately.

        Returns
        -------
        list
            List of PlasmaBuffers for the metadata associated with the
            object_ids and None if the object was not available.
        """
        cdef c_vector[CObjectBuffer] object_buffers
        self._get_object_buffers(object_ids, timeout_ms, &object_buffers)
        result = []
        for i in range(object_buffers.size()):
            if object_buffers[i].metadata.get() != nullptr:
                result.append(pyarrow_wrap_buffer(object_buffers[i].metadata))
            else:
                result.append(None)
        return result

    def put_raw_buffer(self, object value, ObjectID object_id=None,
                       c_string metadata=b"", int memcopy_threads=6):
        """
        Store Python buffer into the object store.

        Parameters
        ----------
        value : Python object that implements the buffer protocol
            A Python buffer object to store.
        object_id : ObjectID, default None
            If this is provided, the specified object ID will be used to refer
            to the object.
        metadata : bytes
            An optional string of bytes encoding whatever metadata the user
            wishes to encode.
        memcopy_threads : int, default 6
            The number of threads to use to write the serialized object into
            the object store for large objects.

        Returns
        -------
        ObjectID
            The object ID associated to the Python buffer object.
        """
        cdef ObjectID target_id = (object_id if object_id
                                   else ObjectID.from_random())
        cdef Buffer arrow_buffer = pyarrow.py_buffer(value)
        write_buffer = self.create(target_id, len(value), metadata)
        stream = pyarrow.FixedSizeBufferWriter(write_buffer)
        stream.set_memcopy_threads(memcopy_threads)
        stream.write(arrow_buffer)
        self.seal(target_id)
        return target_id

    def put(self, object value, ObjectID object_id=None, int memcopy_threads=6,
            serialization_context=None):
        """
        Store a Python value into the object store.

        Parameters
        ----------
        value : object
            A Python object to store.
        object_id : ObjectID, default None
            If this is provided, the specified object ID will be used to refer
            to the object.
        memcopy_threads : int, default 6
            The number of threads to use to write the serialized object into
            the object store for large objects.
        serialization_context : pyarrow.SerializationContext, default None
            Custom serialization and deserialization context.

        Returns
        -------
        ObjectID
            The object ID associated to the Python object.
        """
        cdef ObjectID target_id = (object_id if object_id
                                   else ObjectID.from_random())
        if serialization_context is not None:
            warnings.warn(
                "'serialization_context' is deprecated and will be removed "
                "in a future version.",
                FutureWarning, stacklevel=2
            )
        serialized = pyarrow.lib._serialize(value, serialization_context)
        buffer = self.create(target_id, serialized.total_bytes)
        stream = pyarrow.FixedSizeBufferWriter(buffer)
        stream.set_memcopy_threads(memcopy_threads)
        serialized.write_to(stream)
        self.seal(target_id)
        return target_id

    def get(self, object_ids, int timeout_ms=-1, serialization_context=None):
        """
        Get one or more Python values from the object store.

        Parameters
        ----------
        object_ids : list or ObjectID
            Object ID or list of object IDs associated to the values we get
            from the store.
        timeout_ms : int, default -1
            The number of milliseconds that the get call should block before
            timing out and returning. Pass -1 if the call should block and 0
            if the call should return immediately.
        serialization_context : pyarrow.SerializationContext, default None
            Custom serialization and deserialization context.

        Returns
        -------
        list or object
            Python value or list of Python values for the data associated with
            the object_ids and ObjectNotAvailable if the object was not
            available.
        """
        if serialization_context is not None:
            warnings.warn(
                "'serialization_context' is deprecated and will be removed "
                "in a future version.",
                FutureWarning, stacklevel=2
            )
        if isinstance(object_ids, Sequence):
            results = []
            buffers = self.get_buffers(object_ids, timeout_ms)
            for i in range(len(object_ids)):
                # buffers[i] is None if this object was not available within
                # the timeout
                if buffers[i]:
                    val = pyarrow.lib._deserialize(buffers[i],
                                                   serialization_context)
                    results.append(val)
                else:
                    results.append(ObjectNotAvailable)
            return results
        else:
            return self.get([object_ids], timeout_ms, serialization_context)[0]

    def seal(self, ObjectID object_id):
        """
        Seal the buffer in the PlasmaStore for a particular object ID.

        Once a buffer has been sealed, the buffer is immutable and can only be
        accessed through get.

        Parameters
        ----------
        object_id : ObjectID
            A string used to identify an object.
        """
        with nogil:
            plasma_check_status(self.client.get().Seal(object_id.data))

    def _release(self, ObjectID object_id):
        """
        Notify Plasma that the object is no longer needed.

        Parameters
        ----------
        object_id : ObjectID
            A string used to identify an object.
        """
        with nogil:
            plasma_check_status(self.client.get().Release(object_id.data))

    def contains(self, ObjectID object_id):
        """
        Check if the object is present and sealed in the PlasmaStore.

        Parameters
        ----------
        object_id : ObjectID
            A string used to identify an object.
        """
        cdef c_bool is_contained
        with nogil:
            plasma_check_status(self.client.get().Contains(object_id.data,
                                                           &is_contained))
        return is_contained

    def hash(self, ObjectID object_id):
        """
        Compute the checksum of an object in the object store.

        Parameters
        ----------
        object_id : ObjectID
            A string used to identify an object.

        Returns
        -------
        bytes
            A digest string of the object's hash. If the object isn't in the
            object store, the string will have length zero.
        """
        cdef c_vector[uint8_t] digest = c_vector[uint8_t](kDigestSize)
        with nogil:
            plasma_check_status(self.client.get().Hash(object_id.data,
                                                       digest.data()))
        return bytes(digest[:])

    def evict(self, int64_t num_bytes):
        """
        Evict some objects to recover some bytes.

        Recover at least num_bytes bytes if possible.

        Parameters
        ----------
        num_bytes : int
            The number of bytes to attempt to recover.
        """
        cdef int64_t num_bytes_evicted = -1
        with nogil:
            plasma_check_status(
                self.client.get().Evict(num_bytes, num_bytes_evicted))
        return num_bytes_evicted

    def subscribe(self):
        """Subscribe to notifications about sealed objects."""
        with nogil:
            plasma_check_status(
                self.client.get().Subscribe(&self.notification_fd))

    def get_notification_socket(self):
        """
        Get the notification socket.
        """
        return get_socket_from_fd(self.notification_fd,
family=socket.AF_UNIX,
|
||||
type=socket.SOCK_STREAM)
|
||||
|
||||
def decode_notifications(self, const uint8_t* buf):
|
||||
"""
|
||||
Get the notification from the buffer.
|
||||
|
||||
Returns
|
||||
-------
|
||||
[ObjectID]
|
||||
The list of object IDs in the notification message.
|
||||
c_vector[int64_t]
|
||||
The data sizes of the objects in the notification message.
|
||||
c_vector[int64_t]
|
||||
The metadata sizes of the objects in the notification message.
|
||||
"""
|
||||
cdef c_vector[CUniqueID] ids
|
||||
cdef c_vector[int64_t] data_sizes
|
||||
cdef c_vector[int64_t] metadata_sizes
|
||||
with nogil:
|
||||
status = self.client.get().DecodeNotifications(buf,
|
||||
&ids,
|
||||
&data_sizes,
|
||||
&metadata_sizes)
|
||||
plasma_check_status(status)
|
||||
object_ids = []
|
||||
for object_id in ids:
|
||||
object_ids.append(ObjectID(object_id.binary()))
|
||||
return object_ids, data_sizes, metadata_sizes
|
||||
|
||||
def get_next_notification(self):
|
||||
"""
|
||||
Get the next notification from the notification socket.
|
||||
|
||||
Returns
|
||||
-------
|
||||
ObjectID
|
||||
The object ID of the object that was stored.
|
||||
int
|
||||
The data size of the object that was stored.
|
||||
int
|
||||
The metadata size of the object that was stored.
|
||||
"""
|
||||
cdef ObjectID object_id = ObjectID(CUniqueID.size() * b"\0")
|
||||
cdef int64_t data_size
|
||||
cdef int64_t metadata_size
|
||||
with nogil:
|
||||
status = self.client.get().GetNotification(self.notification_fd,
|
||||
&object_id.data,
|
||||
&data_size,
|
||||
&metadata_size)
|
||||
plasma_check_status(status)
|
||||
return object_id, data_size, metadata_size
|
||||
|
||||
def to_capsule(self):
|
||||
return PyCapsule_New(<void *>self.client.get(), "plasma", NULL)
|
||||
|
||||
def disconnect(self):
|
||||
"""
|
||||
Disconnect this client from the Plasma store.
|
||||
"""
|
||||
with nogil:
|
||||
plasma_check_status(self.client.get().Disconnect())
|
||||
|
||||
def delete(self, object_ids):
|
||||
"""
|
||||
Delete the objects with the given IDs from other object store.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
object_ids : list
|
||||
A list of strings used to identify the objects.
|
||||
"""
|
||||
cdef c_vector[CUniqueID] ids
|
||||
cdef ObjectID object_id
|
||||
for object_id in object_ids:
|
||||
ids.push_back(object_id.data)
|
||||
with nogil:
|
||||
plasma_check_status(self.client.get().Delete(ids))
|
||||
|
||||
def set_client_options(self, client_name, int64_t limit_output_memory):
|
||||
cdef c_string name
|
||||
name = client_name.encode()
|
||||
with nogil:
|
||||
plasma_check_status(
|
||||
self.client.get().SetClientOptions(name, limit_output_memory))
|
||||
|
||||
def debug_string(self):
|
||||
cdef c_string result
|
||||
with nogil:
|
||||
result = self.client.get().DebugString()
|
||||
return result.decode()
|
||||
|
||||
def list(self):
|
||||
"""
|
||||
Experimental: List the objects in the store.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
Dictionary from ObjectIDs to an "info" dictionary describing the
|
||||
object. The "info" dictionary has the following entries:
|
||||
|
||||
data_size
|
||||
size of the object in bytes
|
||||
|
||||
metadata_size
|
||||
size of the object metadata in bytes
|
||||
|
||||
ref_count
|
||||
Number of clients referencing the object buffer
|
||||
|
||||
create_time
|
||||
Unix timestamp of the creation of the object
|
||||
|
||||
construct_duration
|
||||
Time the creation of the object took in seconds
|
||||
|
||||
state
|
||||
"created" if the object is still being created and
|
||||
"sealed" if it is already sealed
|
||||
"""
|
||||
cdef CObjectTable objects
|
||||
with nogil:
|
||||
plasma_check_status(self.client.get().List(&objects))
|
||||
result = dict()
|
||||
cdef ObjectID object_id
|
||||
cdef CObjectTableEntry entry
|
||||
it = objects.begin()
|
||||
while it != objects.end():
|
||||
object_id = ObjectID(deref(it).first.binary())
|
||||
entry = deref(deref(it).second)
|
||||
if entry.state == CObjectState.PLASMA_CREATED:
|
||||
state = "created"
|
||||
else:
|
||||
state = "sealed"
|
||||
result[object_id] = {
|
||||
"data_size": entry.data_size,
|
||||
"metadata_size": entry.metadata_size,
|
||||
"ref_count": entry.ref_count,
|
||||
"create_time": entry.create_time,
|
||||
"construct_duration": entry.construct_duration,
|
||||
"state": state
|
||||
}
|
||||
inc(it)
|
||||
return result
|
||||
|
||||
def store_capacity(self):
|
||||
"""
|
||||
Get the memory capacity of the store.
|
||||
|
||||
Returns
|
||||
-------
|
||||
|
||||
int
|
||||
The memory capacity of the store in bytes.
|
||||
"""
|
||||
return self.client.get().store_capacity()
|
||||
|
||||
|
||||
def connect(store_socket_name, int num_retries=-1):
|
||||
"""
|
||||
DEPRECATED: Return a new PlasmaClient that is connected a plasma store and
|
||||
optionally a manager.
|
||||
|
||||
.. deprecated:: 10.0.0
|
||||
Plasma is deprecated since Arrow 10.0.0. It will be removed in 12.0.0 or so.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
store_socket_name : str
|
||||
Name of the socket the plasma store is listening at.
|
||||
num_retries : int, default -1
|
||||
Number of times to try to connect to plasma store. Default value of -1
|
||||
uses the default (50)
|
||||
"""
|
||||
cdef PlasmaClient result = PlasmaClient()
|
||||
cdef int deprecated_release_delay = 0
|
||||
result.store_socket_name = store_socket_name.encode()
|
||||
with nogil:
|
||||
plasma_check_status(
|
||||
result.client.get().Connect(result.store_socket_name, b"",
|
||||
deprecated_release_delay, num_retries))
|
||||
return result
|
||||
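For orientation, a minimal sketch of the round trip this client API supports; it assumes a plasma store process is already listening on /tmp/plasma (the socket path is illustrative) and uses only the methods defined above:

    import pyarrow.plasma as plasma

    client = plasma.connect("/tmp/plasma")
    # put() serializes the value, writes it into shared memory and seals it;
    # get() maps the sealed buffer back in and deserializes it.
    oid = client.put({"answer": 42})
    assert client.contains(oid)
    print(client.get(oid, timeout_ms=1000))
    client.disconnect()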
BIN
venv/lib/python3.9/site-packages/pyarrow/_pyarrow_cpp_tests.cpython-39-darwin.so
Executable file
Binary file not shown.
33
venv/lib/python3.9/site-packages/pyarrow/_pyarrow_cpp_tests.pxd
Normal file
@@ -0,0 +1,33 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# distutils: language = c++
# cython: language_level = 3

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport CStatus


ctypedef CStatus cb_test_func()

cdef extern from "arrow/python/python_test.h" namespace "arrow::py::testing" nogil:

    cdef cppclass CTestCase "arrow::py::testing::TestCase":
        c_string name
        cb_test_func func

    vector[CTestCase] GetCppTestCases()
62
venv/lib/python3.9/site-packages/pyarrow/_pyarrow_cpp_tests.pyx
Normal file
@@ -0,0 +1,62 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: profile=False, binding=True
# distutils: language = c++

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport check_status

from pyarrow.lib import frombytes


cdef class CppTestCase:
    """
    A simple wrapper for a C++ test case.
    """
    cdef:
        CTestCase c_case

    @staticmethod
    cdef wrap(CTestCase c_case):
        cdef:
            CppTestCase obj
        obj = CppTestCase.__new__(CppTestCase)
        obj.c_case = c_case
        return obj

    @property
    def name(self):
        return frombytes(self.c_case.name)

    def __repr__(self):
        return f"<{self.__class__.__name__} {self.name!r}>"

    def __call__(self):
        check_status(self.c_case.func())


def get_cpp_tests():
    """
    Get a list of C++ test cases.
    """
    cases = []
    c_cases = GetCppTestCases()
    for c_case in c_cases:
        cases.append(CppTestCase.wrap(c_case))
    return cases
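A hedged sketch of how these wrappers could be driven; the module name _pyarrow_cpp_tests is inferred from the binary file names in this diff:

    from pyarrow._pyarrow_cpp_tests import get_cpp_tests

    # Each CppTestCase raises through check_status() if its C++ body fails.
    for case in get_cpp_tests():
        print("running", case.name)
        case()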
BIN
venv/lib/python3.9/site-packages/pyarrow/_s3fs.cpython-39-darwin.so
Executable file
Binary file not shown.
420
venv/lib/python3.9/site-packages/pyarrow/_s3fs.pyx
Normal file
@@ -0,0 +1,420 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3

from pyarrow.lib cimport (check_status, pyarrow_wrap_metadata,
                          pyarrow_unwrap_metadata)
from pyarrow.lib import frombytes, tobytes, KeyValueMetadata
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_fs cimport *
from pyarrow._fs cimport FileSystem


cpdef enum S3LogLevel:
    Off = <int8_t> CS3LogLevel_Off
    Fatal = <int8_t> CS3LogLevel_Fatal
    Error = <int8_t> CS3LogLevel_Error
    Warn = <int8_t> CS3LogLevel_Warn
    Info = <int8_t> CS3LogLevel_Info
    Debug = <int8_t> CS3LogLevel_Debug
    Trace = <int8_t> CS3LogLevel_Trace


def initialize_s3(S3LogLevel log_level=S3LogLevel.Fatal):
    """
    Initialize S3 support.

    Parameters
    ----------
    log_level : S3LogLevel
        Level of logging.

    Examples
    --------
    >>> fs.initialize_s3(fs.S3LogLevel.Error) # doctest: +SKIP
    """
    cdef CS3GlobalOptions options
    options.log_level = <CS3LogLevel> log_level
    check_status(CInitializeS3(options))


def finalize_s3():
    check_status(CFinalizeS3())


def resolve_s3_region(bucket):
    """
    Resolve the S3 region of a bucket.

    Parameters
    ----------
    bucket : str
        An S3 bucket name.

    Returns
    -------
    region : str
        An S3 region name.

    Examples
    --------
    >>> fs.resolve_s3_region('voltrondata-labs-datasets')
    'us-east-2'
    """
    cdef:
        c_string c_bucket
        c_string c_region

    c_bucket = tobytes(bucket)
    with nogil:
        c_region = GetResultValue(ResolveS3BucketRegion(c_bucket))

    return frombytes(c_region)


class S3RetryStrategy:
    """
    Base class for AWS retry strategies for use with S3.

    Parameters
    ----------
    max_attempts : int, default 3
        The maximum number of retry attempts to attempt before failing.
    """

    def __init__(self, max_attempts=3):
        self.max_attempts = max_attempts


class AwsStandardS3RetryStrategy(S3RetryStrategy):
    """
    Represents an AWS Standard retry strategy for use with S3.

    Parameters
    ----------
    max_attempts : int, default 3
        The maximum number of retry attempts to attempt before failing.
    """
    pass


class AwsDefaultS3RetryStrategy(S3RetryStrategy):
    """
    Represents an AWS Default retry strategy for use with S3.

    Parameters
    ----------
    max_attempts : int, default 3
        The maximum number of retry attempts to attempt before failing.
    """
    pass


cdef class S3FileSystem(FileSystem):
    """
    S3-backed FileSystem implementation

    If neither access_key nor secret_key are provided, and role_arn is also
    not provided, then attempts to initialize from AWS environment variables;
    otherwise both access_key and secret_key must be provided.

    If role_arn is provided instead of access_key and secret_key, temporary
    credentials will be fetched by issuing a request to STS to assume the
    specified role.

    Note: S3 buckets are special and the operations available on them may be
    limited or more expensive than desired.

    When S3FileSystem creates new buckets (assuming allow_bucket_creation is
    True), it does not pass any non-default settings. In AWS S3, the bucket
    and all objects will not be publicly visible, and will have no bucket
    policies and no resource tags. To have more control over how buckets are
    created, use a different API to create them.

    Parameters
    ----------
    access_key : str, default None
        AWS Access Key ID. Pass None to use the standard AWS environment
        variables and/or configuration file.
    secret_key : str, default None
        AWS Secret Access Key. Pass None to use the standard AWS environment
        variables and/or configuration file.
    session_token : str, default None
        AWS Session Token. An optional session token, required if access_key
        and secret_key are temporary credentials from STS.
    anonymous : boolean, default False
        Whether to connect anonymously if access_key and secret_key are None.
        If true, will not attempt to look up credentials using standard AWS
        configuration methods.
    role_arn : str, default None
        AWS Role ARN. If provided instead of access_key and secret_key,
        temporary credentials will be fetched by assuming this role.
    session_name : str, default None
        An optional identifier for the assumed role session.
    external_id : str, default None
        An optional unique identifier that might be required when you assume
        a role in another account.
    load_frequency : int, default 900
        The frequency (in seconds) with which temporary credentials from an
        assumed role session will be refreshed.
    region : str, default None
        AWS region to connect to. If not set, the AWS SDK will attempt to
        determine the region using heuristics such as environment variables,
        configuration profile, EC2 metadata, or default to 'us-east-1' when
        SDK version <1.8. One can also use :func:`pyarrow.fs.resolve_s3_region`
        to automatically resolve the region from a bucket name.
    request_timeout : double, default None
        Socket read timeouts on Windows and macOS, in seconds.
        If omitted, the AWS SDK default value is used (typically 3 seconds).
        This option is ignored on non-Windows, non-macOS systems.
    connect_timeout : double, default None
        Socket connection timeout, in seconds.
        If omitted, the AWS SDK default value is used (typically 1 second).
    scheme : str, default 'https'
        S3 connection transport scheme.
    endpoint_override : str, default None
        Override the S3 endpoint with a connect string such as
        "localhost:9000".
    background_writes : boolean, default True
        Whether file writes will be issued in the background, without
        blocking.
    default_metadata : mapping or pyarrow.KeyValueMetadata, default None
        Default metadata for open_output_stream. This will be ignored if
        non-empty metadata is passed to open_output_stream.
    proxy_options : dict or str, default None
        If a proxy is used, provide the options here. Supported options are:
        'scheme' (str: 'http' or 'https'; required), 'host' (str; required),
        'port' (int; required), 'username' (str; optional),
        'password' (str; optional).
        A proxy URI (str) can also be provided, in which case these options
        will be derived from the provided URI.
        The following are equivalent::

            S3FileSystem(proxy_options='http://username:password@localhost:8020')
            S3FileSystem(proxy_options={'scheme': 'http', 'host': 'localhost',
                                        'port': 8020, 'username': 'username',
                                        'password': 'password'})

    allow_bucket_creation : bool, default False
        Whether to allow CreateDir at the bucket-level. This option may also
        be passed in a URI query parameter.
    allow_bucket_deletion : bool, default False
        Whether to allow DeleteDir at the bucket-level. This option may also
        be passed in a URI query parameter.
    retry_strategy : S3RetryStrategy, default AwsStandardS3RetryStrategy(max_attempts=3)
        The retry strategy to use with S3; fail after max_attempts. Available
        strategies are AwsStandardS3RetryStrategy, AwsDefaultS3RetryStrategy.

    Examples
    --------
    >>> from pyarrow import fs
    >>> s3 = fs.S3FileSystem(region='us-west-2')
    >>> s3.get_file_info(fs.FileSelector(
    ...    'power-analysis-ready-datastore/power_901_constants.zarr/FROCEAN', recursive=True
    ... ))
    [<FileInfo for 'power-analysis-ready-datastore/power_901_constants.zarr/FROCEAN/.zarray...

    For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`.
    """

    cdef:
        CS3FileSystem* s3fs

    def __init__(self, *, access_key=None, secret_key=None, session_token=None,
                 bint anonymous=False, region=None, request_timeout=None,
                 connect_timeout=None, scheme=None, endpoint_override=None,
                 bint background_writes=True, default_metadata=None,
                 role_arn=None, session_name=None, external_id=None,
                 load_frequency=900, proxy_options=None,
                 allow_bucket_creation=False, allow_bucket_deletion=False,
                 retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3)):
        cdef:
            CS3Options options
            shared_ptr[CS3FileSystem] wrapped

        if access_key is not None and secret_key is None:
            raise ValueError(
                'In order to initialize with explicit credentials both '
                'access_key and secret_key must be provided, '
                '`secret_key` is not set.'
            )
        elif access_key is None and secret_key is not None:
            raise ValueError(
                'In order to initialize with explicit credentials both '
                'access_key and secret_key must be provided, '
                '`access_key` is not set.'
            )

        elif session_token is not None and (access_key is None or
                                            secret_key is None):
            raise ValueError(
                'In order to initialize a session with temporary credentials, '
                'both secret_key and access_key must be provided in addition '
                'to session_token.'
            )

        elif (access_key is not None or secret_key is not None):
            if anonymous:
                raise ValueError(
                    'Cannot pass anonymous=True together with access_key '
                    'and secret_key.')

            if role_arn:
                raise ValueError(
                    'Cannot provide role_arn with access_key and secret_key')

            if session_token is None:
                session_token = ""

            options = CS3Options.FromAccessKey(
                tobytes(access_key),
                tobytes(secret_key),
                tobytes(session_token)
            )
        elif anonymous:
            if role_arn:
                raise ValueError(
                    'Cannot provide role_arn with anonymous=True')

            options = CS3Options.Anonymous()
        elif role_arn:
            if session_name is None:
                session_name = ''
            if external_id is None:
                external_id = ''

            options = CS3Options.FromAssumeRole(
                tobytes(role_arn),
                tobytes(session_name),
                tobytes(external_id),
                load_frequency
            )
        else:
            options = CS3Options.Defaults()

        if region is not None:
            options.region = tobytes(region)
        if request_timeout is not None:
            options.request_timeout = request_timeout
        if connect_timeout is not None:
            options.connect_timeout = connect_timeout
        if scheme is not None:
            options.scheme = tobytes(scheme)
        if endpoint_override is not None:
            options.endpoint_override = tobytes(endpoint_override)
        if background_writes is not None:
            options.background_writes = background_writes
        if default_metadata is not None:
            if not isinstance(default_metadata, KeyValueMetadata):
                default_metadata = KeyValueMetadata(default_metadata)
            options.default_metadata = pyarrow_unwrap_metadata(
                default_metadata)

        if proxy_options is not None:
            if isinstance(proxy_options, dict):
                options.proxy_options.scheme = tobytes(proxy_options["scheme"])
                options.proxy_options.host = tobytes(proxy_options["host"])
                options.proxy_options.port = proxy_options["port"]
                proxy_username = proxy_options.get("username", None)
                if proxy_username:
                    options.proxy_options.username = tobytes(proxy_username)
                proxy_password = proxy_options.get("password", None)
                if proxy_password:
                    options.proxy_options.password = tobytes(proxy_password)
            elif isinstance(proxy_options, str):
                options.proxy_options = GetResultValue(
                    CS3ProxyOptions.FromUriString(tobytes(proxy_options)))
            else:
                raise TypeError(
                    "'proxy_options': expected 'dict' or 'str', "
                    f"got {type(proxy_options)} instead.")

        options.allow_bucket_creation = allow_bucket_creation
        options.allow_bucket_deletion = allow_bucket_deletion

        if isinstance(retry_strategy, AwsStandardS3RetryStrategy):
            options.retry_strategy = CS3RetryStrategy.GetAwsStandardRetryStrategy(
                retry_strategy.max_attempts)
        elif isinstance(retry_strategy, AwsDefaultS3RetryStrategy):
            options.retry_strategy = CS3RetryStrategy.GetAwsDefaultRetryStrategy(
                retry_strategy.max_attempts)
        else:
            raise ValueError(f'Invalid retry_strategy {retry_strategy!r}')

        with nogil:
            wrapped = GetResultValue(CS3FileSystem.Make(options))

        self.init(<shared_ptr[CFileSystem]> wrapped)

    cdef init(self, const shared_ptr[CFileSystem]& wrapped):
        FileSystem.init(self, wrapped)
        self.s3fs = <CS3FileSystem*> wrapped.get()

    @classmethod
    def _reconstruct(cls, kwargs):
        return cls(**kwargs)

    def __reduce__(self):
        cdef CS3Options opts = self.s3fs.options()

        # if creds were explicitly provided, then use them
        # else obtain them as they were last time.
        if opts.credentials_kind == CS3CredentialsKind_Explicit:
            access_key = frombytes(opts.GetAccessKey())
            secret_key = frombytes(opts.GetSecretKey())
            session_token = frombytes(opts.GetSessionToken())
        else:
            access_key = None
            secret_key = None
            session_token = None

        return (
            S3FileSystem._reconstruct, (dict(
                access_key=access_key,
                secret_key=secret_key,
                session_token=session_token,
                anonymous=(opts.credentials_kind ==
                           CS3CredentialsKind_Anonymous),
                region=frombytes(opts.region),
                scheme=frombytes(opts.scheme),
                connect_timeout=opts.connect_timeout,
                request_timeout=opts.request_timeout,
                endpoint_override=frombytes(opts.endpoint_override),
                role_arn=frombytes(opts.role_arn),
                session_name=frombytes(opts.session_name),
                external_id=frombytes(opts.external_id),
                load_frequency=opts.load_frequency,
                background_writes=opts.background_writes,
                allow_bucket_creation=opts.allow_bucket_creation,
                allow_bucket_deletion=opts.allow_bucket_deletion,
                default_metadata=pyarrow_wrap_metadata(opts.default_metadata),
                proxy_options={'scheme': frombytes(opts.proxy_options.scheme),
                               'host': frombytes(opts.proxy_options.host),
                               'port': opts.proxy_options.port,
                               'username': frombytes(
                                   opts.proxy_options.username),
                               'password': frombytes(
                                   opts.proxy_options.password)},
            ),)
        )

    @property
    def region(self):
        """
        The AWS region this filesystem connects to.
        """
        return frombytes(self.s3fs.region())
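Because __reduce__ above rebuilds the filesystem from its extracted options, S3FileSystem instances survive pickling across processes. A minimal sketch, with placeholder credentials and endpoint (not real values):

    import pickle
    from pyarrow.fs import S3FileSystem

    fs = S3FileSystem(access_key="minio", secret_key="minio123",
                      endpoint_override="localhost:9000", scheme="http")
    fs2 = pickle.loads(pickle.dumps(fs))   # goes through _reconstruct()
    assert fs2.region == fs.region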
BIN
venv/lib/python3.9/site-packages/pyarrow/_substrait.cpython-39-darwin.so
Executable file
Binary file not shown.
207
venv/lib/python3.9/site-packages/pyarrow/_substrait.pyx
Normal file
@@ -0,0 +1,207 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: language_level = 3
from cython.operator cimport dereference as deref
from libcpp.vector cimport vector as std_vector

from pyarrow import Buffer, py_buffer
from pyarrow.lib import frombytes, tobytes
from pyarrow.lib cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.includes.libarrow_substrait cimport *


cdef CDeclaration _create_named_table_provider(dict named_args, const std_vector[c_string]& names):
    cdef:
        c_string c_name
        shared_ptr[CTable] c_in_table
        shared_ptr[CTableSourceNodeOptions] c_tablesourceopts
        shared_ptr[CExecNodeOptions] c_input_node_opts
        vector[CDeclaration.Input] no_c_inputs

    py_names = []
    for i in range(names.size()):
        c_name = names[i]
        py_names.append(frombytes(c_name))

    py_table = named_args["provider"](py_names)
    c_in_table = pyarrow_unwrap_table(py_table)
    c_tablesourceopts = make_shared[CTableSourceNodeOptions](c_in_table)
    c_input_node_opts = static_pointer_cast[CExecNodeOptions, CTableSourceNodeOptions](
        c_tablesourceopts)
    return CDeclaration(tobytes("table_source"),
                        no_c_inputs, c_input_node_opts)


def run_query(plan, *, table_provider=None, use_threads=True):
    """
    Execute a Substrait plan and read the results as a RecordBatchReader.

    Parameters
    ----------
    plan : Union[Buffer, bytes]
        The serialized Substrait plan to execute.
    table_provider : object (optional)
        A function to resolve any NamedTable relation to a table.
        The function will receive a single argument which will be a list
        of strings representing the table name, and should return a
        pyarrow.Table.
    use_threads : bool, default True
        If True then multiple threads will be used to run the query. If False
        then all CPU intensive work will be done on the calling thread.

    Returns
    -------
    RecordBatchReader
        A reader containing the result of the executed query.

    Examples
    --------
    >>> import pyarrow as pa
    >>> from pyarrow.lib import tobytes
    >>> import pyarrow.substrait as substrait
    >>> test_table_1 = pa.Table.from_pydict({"x": [1, 2, 3]})
    >>> test_table_2 = pa.Table.from_pydict({"x": [4, 5, 6]})
    >>> def table_provider(names):
    ...     if not names:
    ...         raise Exception("No names provided")
    ...     elif names[0] == "t1":
    ...         return test_table_1
    ...     elif names[0] == "t2":
    ...         return test_table_2
    ...     else:
    ...         raise Exception("Unrecognized table name")
    ...
    >>> substrait_query = '''
    ... {
    ...     "relations": [
    ...     {"rel": {
    ...         "read": {
    ...         "base_schema": {
    ...             "struct": {
    ...             "types": [
    ...                 {"i64": {}}
    ...             ]
    ...             },
    ...             "names": [
    ...             "x"
    ...             ]
    ...         },
    ...         "namedTable": {
    ...             "names": ["t1"]
    ...         }
    ...         }
    ...     }}
    ...     ]
    ... }
    ... '''
    >>> buf = pa._substrait._parse_json_plan(tobytes(substrait_query))
    >>> reader = pa.substrait.run_query(buf, table_provider=table_provider)
    >>> reader.read_all()
    pyarrow.Table
    x: int64
    ----
    x: [[1,2,3]]
    """

    cdef:
        CResult[shared_ptr[CRecordBatchReader]] c_res_reader
        shared_ptr[CRecordBatchReader] c_reader
        RecordBatchReader reader
        c_string c_str_plan
        shared_ptr[CBuffer] c_buf_plan
        function[CNamedTableProvider] c_named_table_provider
        CConversionOptions c_conversion_options
        c_bool c_use_threads

    c_use_threads = use_threads
    if isinstance(plan, bytes):
        c_buf_plan = pyarrow_unwrap_buffer(py_buffer(plan))
    elif isinstance(plan, Buffer):
        c_buf_plan = pyarrow_unwrap_buffer(plan)
    else:
        raise TypeError(
            f"Expected 'pyarrow.Buffer' or bytes, got '{type(plan)}'")

    if table_provider is not None:
        named_table_args = {
            "provider": table_provider
        }
        c_conversion_options.named_table_provider = BindFunction[CNamedTableProvider](
            &_create_named_table_provider, named_table_args)

    with nogil:
        c_res_reader = ExecuteSerializedPlan(
            deref(c_buf_plan), default_extension_id_registry(),
            GetFunctionRegistry(), c_conversion_options, c_use_threads)

    c_reader = GetResultValue(c_res_reader)

    reader = RecordBatchReader.__new__(RecordBatchReader)
    reader.reader = c_reader
    return reader


def _parse_json_plan(plan):
    """
    Parse a JSON plan into an equivalent serialized Protobuf.

    Parameters
    ----------
    plan : bytes
        Substrait plan in JSON.

    Returns
    -------
    Buffer
        A buffer containing the serialized Protobuf plan.
    """

    cdef:
        CResult[shared_ptr[CBuffer]] c_res_buffer
        c_string c_str_plan
        shared_ptr[CBuffer] c_buf_plan

    c_str_plan = plan
    c_res_buffer = SerializeJsonPlan(c_str_plan)
    with nogil:
        c_buf_plan = GetResultValue(c_res_buffer)
    return pyarrow_wrap_buffer(c_buf_plan)


def get_supported_functions():
    """
    Get a list of Substrait functions that the underlying
    engine currently supports.

    Returns
    -------
    list[str]
        A list of function ids encoded as '{uri}#{name}'.
    """

    cdef:
        ExtensionIdRegistry* c_id_registry
        std_vector[c_string] c_ids

    c_id_registry = default_extension_id_registry()
    c_ids = c_id_registry.GetSupportedSubstraitFunctions()

    functions_list = []
    for c_id in c_ids:
        functions_list.append(frombytes(c_id))
    return functions_list
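A small usage sketch for the registry helper above, assuming a pyarrow build with Substrait support enabled:

    import pyarrow.substrait as substrait

    # Function ids are returned as '{uri}#{name}' strings.
    for function_id in substrait.get_supported_functions()[:5]:
        print(function_id)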
3054
venv/lib/python3.9/site-packages/pyarrow/array.pxi
Normal file
File diff suppressed because it is too large
20
venv/lib/python3.9/site-packages/pyarrow/benchmark.pxi
Normal file
@@ -0,0 +1,20 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


def benchmark_PandasObjectIsNull(list obj):
    Benchmark_PandasObjectIsNull(obj)
21
venv/lib/python3.9/site-packages/pyarrow/benchmark.py
Normal file
@@ -0,0 +1,21 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# flake8: noqa


from pyarrow.lib import benchmark_PandasObjectIsNull
82
venv/lib/python3.9/site-packages/pyarrow/builder.pxi
Normal file
@@ -0,0 +1,82 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


cdef class StringBuilder(_Weakrefable):
    """
    Builder class for UTF8 strings.

    This class exposes facilities for incrementally adding string values and
    building the null bitmap for a pyarrow.Array (type='string').
    """
    cdef:
        unique_ptr[CStringBuilder] builder

    def __cinit__(self, MemoryPool memory_pool=None):
        cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
        self.builder.reset(new CStringBuilder(pool))

    def append(self, value):
        """
        Append a single value to the builder.

        The value can either be a string/bytes object or a null value
        (np.nan or None).

        Parameters
        ----------
        value : string/bytes or np.nan/None
            The value to append to the string array builder.
        """
        if value is None or value is np.nan:
            self.builder.get().AppendNull()
        elif isinstance(value, (bytes, str)):
            self.builder.get().Append(tobytes(value))
        else:
            raise TypeError('StringBuilder only accepts string objects')

    def append_values(self, values):
        """
        Append all the values from an iterable.

        Parameters
        ----------
        values : iterable of string/bytes or np.nan/None values
            The values to append to the string array builder.
        """
        for value in values:
            self.append(value)

    def finish(self):
        """
        Return result of builder as an Array object; also resets the builder.

        Returns
        -------
        array : pyarrow.Array
        """
        cdef shared_ptr[CArray] out
        with nogil:
            self.builder.get().Finish(&out)
        return pyarrow_wrap_array(out)

    @property
    def null_count(self):
        return self.builder.get().null_count()

    def __len__(self):
        return self.builder.get().length()
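A minimal usage sketch for the builder; since this .pxi is compiled into pyarrow.lib, StringBuilder is importable from there:

    from pyarrow.lib import StringBuilder

    b = StringBuilder()
    b.append("foo")
    b.append_values(["bar", None])      # None becomes a null entry
    arr = b.finish()                    # pyarrow.Array of type string
    print(len(arr), arr.null_count)     # 3 1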
71
venv/lib/python3.9/site-packages/pyarrow/cffi.py
Normal file
@@ -0,0 +1,71 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from __future__ import absolute_import

import cffi

c_source = """
struct ArrowSchema {
  // Array type description
  const char* format;
  const char* name;
  const char* metadata;
  int64_t flags;
  int64_t n_children;
  struct ArrowSchema** children;
  struct ArrowSchema* dictionary;

  // Release callback
  void (*release)(struct ArrowSchema*);
  // Opaque producer-specific data
  void* private_data;
};

struct ArrowArray {
  // Array data description
  int64_t length;
  int64_t null_count;
  int64_t offset;
  int64_t n_buffers;
  int64_t n_children;
  const void** buffers;
  struct ArrowArray** children;
  struct ArrowArray* dictionary;

  // Release callback
  void (*release)(struct ArrowArray*);
  // Opaque producer-specific data
  void* private_data;
};

struct ArrowArrayStream {
  int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
  int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);

  const char* (*get_last_error)(struct ArrowArrayStream*);

  // Release callback
  void (*release)(struct ArrowArrayStream*);
  // Opaque producer-specific data
  void* private_data;
};
"""

# TODO use out-of-line mode for faster import and avoid C parsing
ffi = cffi.FFI()
ffi.cdef(c_source)
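The structs above mirror the Arrow C data interface, so ffi can allocate them and hand their addresses to pyarrow's exporters. A hedged round-trip sketch; note that _export_to_c and _import_from_c are internal pyarrow APIs and may change:

    import pyarrow as pa
    from pyarrow.cffi import ffi

    c_schema = ffi.new("struct ArrowSchema*")
    c_array = ffi.new("struct ArrowArray*")
    ptr_schema = int(ffi.cast("uintptr_t", c_schema))
    ptr_array = int(ffi.cast("uintptr_t", c_array))

    arr = pa.array([1, 2, None])
    arr._export_to_c(ptr_array, ptr_schema)    # fills both structs
    roundtripped = pa.Array._import_from_c(ptr_array, ptr_schema)
    assert roundtripped.equals(arr)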
77
venv/lib/python3.9/site-packages/pyarrow/compat.pxi
Normal file
@@ -0,0 +1,77 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


def encode_file_path(path):
    if isinstance(path, str):
        # POSIX systems can handle utf-8; on Windows, utf-8 is converted to
        # utf-16le inside libarrow
        encoded_path = path.encode('utf-8')
    else:
        encoded_path = path

    # Windows file systems require utf-16le for file names; the Arrow C++
    # libraries will convert utf8 to utf16
    return encoded_path


# Starting with Python 3.7, dicts are guaranteed to be insertion-ordered.
ordered_dict = dict


try:
    import pickle5 as builtin_pickle
except ImportError:
    import pickle as builtin_pickle


try:
    import cloudpickle as pickle
except ImportError:
    pickle = builtin_pickle


def tobytes(o):
    """
    Encode a unicode or bytes string to bytes.

    Parameters
    ----------
    o : str or bytes
        Input string.
    """
    if isinstance(o, str):
        return o.encode('utf8')
    else:
        return o


def frombytes(o, *, safe=False):
    """
    Decode the given bytestring to unicode.

    Parameters
    ----------
    o : bytes-like
        Input object.
    safe : bool, default False
        If true, decode using ``errors='replace'`` instead of raising on
        undecodable bytes.
    """
    if safe:
        return o.decode('utf8', errors='replace')
    else:
        return o.decode('utf8')
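The two helpers are inverses on well-formed UTF-8; a quick sketch of their behavior, including the safe flag:

    assert tobytes("h\u00e9llo") == b"h\xc3\xa9llo"
    assert frombytes(b"h\xc3\xa9llo") == "h\u00e9llo"
    # safe=True substitutes U+FFFD for undecodable bytes instead of raising
    assert frombytes(b"\xff", safe=True) == "\ufffd"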
702
venv/lib/python3.9/site-packages/pyarrow/compute.py
Normal file
@@ -0,0 +1,702 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from pyarrow._compute import ( # noqa
|
||||
Function,
|
||||
FunctionOptions,
|
||||
FunctionRegistry,
|
||||
HashAggregateFunction,
|
||||
HashAggregateKernel,
|
||||
Kernel,
|
||||
ScalarAggregateFunction,
|
||||
ScalarAggregateKernel,
|
||||
ScalarFunction,
|
||||
ScalarKernel,
|
||||
VectorFunction,
|
||||
VectorKernel,
|
||||
# Option classes
|
||||
ArraySortOptions,
|
||||
AssumeTimezoneOptions,
|
||||
CastOptions,
|
||||
CountOptions,
|
||||
CumulativeSumOptions,
|
||||
DayOfWeekOptions,
|
||||
DictionaryEncodeOptions,
|
||||
ElementWiseAggregateOptions,
|
||||
ExtractRegexOptions,
|
||||
FilterOptions,
|
||||
IndexOptions,
|
||||
JoinOptions,
|
||||
ListSliceOptions,
|
||||
MakeStructOptions,
|
||||
MapLookupOptions,
|
||||
MatchSubstringOptions,
|
||||
ModeOptions,
|
||||
NullOptions,
|
||||
PadOptions,
|
||||
PartitionNthOptions,
|
||||
QuantileOptions,
|
||||
RandomOptions,
|
||||
RankOptions,
|
||||
ReplaceSliceOptions,
|
||||
ReplaceSubstringOptions,
|
||||
RoundOptions,
|
||||
RoundTemporalOptions,
|
||||
RoundToMultipleOptions,
|
||||
ScalarAggregateOptions,
|
||||
SelectKOptions,
|
||||
SetLookupOptions,
|
||||
SliceOptions,
|
||||
SortOptions,
|
||||
SplitOptions,
|
||||
SplitPatternOptions,
|
||||
StrftimeOptions,
|
||||
StrptimeOptions,
|
||||
StructFieldOptions,
|
||||
TakeOptions,
|
||||
TDigestOptions,
|
||||
TrimOptions,
|
||||
Utf8NormalizeOptions,
|
||||
VarianceOptions,
|
||||
WeekOptions,
|
||||
# Functions
|
||||
call_function,
|
||||
function_registry,
|
||||
get_function,
|
||||
list_functions,
|
||||
_group_by,
|
||||
# Udf
|
||||
register_scalar_function,
|
||||
ScalarUdfContext,
|
||||
# Expressions
|
||||
Expression,
|
||||
)
|
||||
|
||||
from collections import namedtuple
|
||||
import inspect
|
||||
from textwrap import dedent
|
||||
import warnings
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow import _compute_docstrings
|
||||
from pyarrow.vendored import docscrape
|
||||
|
||||
|
||||
def _get_arg_names(func):
|
||||
return func._doc.arg_names
|
||||
|
||||
|
||||
_OptionsClassDoc = namedtuple('_OptionsClassDoc', ('params',))
|
||||
|
||||
|
||||
def _scrape_options_class_doc(options_class):
|
||||
if not options_class.__doc__:
|
||||
return None
|
||||
doc = docscrape.NumpyDocString(options_class.__doc__)
|
||||
return _OptionsClassDoc(doc['Parameters'])
|
||||
|
||||
|
||||
def _decorate_compute_function(wrapper, exposed_name, func, options_class):
|
||||
# Decorate the given compute function wrapper with useful metadata
|
||||
# and documentation.
|
||||
cpp_doc = func._doc
|
||||
|
||||
wrapper.__arrow_compute_function__ = dict(
|
||||
name=func.name,
|
||||
arity=func.arity,
|
||||
options_class=cpp_doc.options_class,
|
||||
options_required=cpp_doc.options_required)
|
||||
wrapper.__name__ = exposed_name
|
||||
wrapper.__qualname__ = exposed_name
|
||||
|
||||
doc_pieces = []
|
||||
|
||||
# 1. One-line summary
|
||||
summary = cpp_doc.summary
|
||||
if not summary:
|
||||
arg_str = "arguments" if func.arity > 1 else "argument"
|
||||
summary = ("Call compute function {!r} with the given {}"
|
||||
.format(func.name, arg_str))
|
||||
|
||||
doc_pieces.append(f"{summary}.\n\n")
|
||||
|
||||
# 2. Multi-line description
|
||||
description = cpp_doc.description
|
||||
if description:
|
||||
doc_pieces.append(f"{description}\n\n")
|
||||
|
||||
doc_addition = _compute_docstrings.function_doc_additions.get(func.name)
|
||||
|
||||
# 3. Parameter description
|
||||
doc_pieces.append(dedent("""\
|
||||
Parameters
|
||||
----------
|
||||
"""))
|
||||
|
||||
# 3a. Compute function parameters
|
||||
arg_names = _get_arg_names(func)
|
||||
for arg_name in arg_names:
|
||||
if func.kind in ('vector', 'scalar_aggregate'):
|
||||
arg_type = 'Array-like'
|
||||
else:
|
||||
arg_type = 'Array-like or scalar-like'
|
||||
doc_pieces.append(f"{arg_name} : {arg_type}\n")
|
||||
doc_pieces.append(" Argument to compute function.\n")
|
||||
|
||||
# 3b. Compute function option values
|
||||
if options_class is not None:
|
||||
options_class_doc = _scrape_options_class_doc(options_class)
|
||||
if options_class_doc:
|
||||
for p in options_class_doc.params:
|
||||
doc_pieces.append(f"{p.name} : {p.type}\n")
|
||||
for s in p.desc:
|
||||
doc_pieces.append(f" {s}\n")
|
||||
else:
|
||||
warnings.warn(f"Options class {options_class.__name__} "
|
||||
f"does not have a docstring", RuntimeWarning)
|
||||
options_sig = inspect.signature(options_class)
|
||||
for p in options_sig.parameters.values():
|
||||
doc_pieces.append(dedent("""\
|
||||
{0} : optional
|
||||
Parameter for {1} constructor. Either `options`
|
||||
or `{0}` can be passed, but not both at the same time.
|
||||
""".format(p.name, options_class.__name__)))
|
||||
doc_pieces.append(dedent(f"""\
|
||||
options : pyarrow.compute.{options_class.__name__}, optional
|
||||
Alternative way of passing options.
|
||||
"""))
|
||||
|
||||
doc_pieces.append(dedent("""\
|
||||
memory_pool : pyarrow.MemoryPool, optional
|
||||
If not passed, will allocate memory from the default memory pool.
|
||||
"""))
|
||||
|
||||
# 4. Custom addition (e.g. examples)
|
||||
if doc_addition is not None:
|
||||
doc_pieces.append("\n{}\n".format(dedent(doc_addition).strip("\n")))
|
||||
|
||||
wrapper.__doc__ = "".join(doc_pieces)
|
||||
return wrapper
|
||||
|
||||
|
||||
def _get_options_class(func):
|
||||
class_name = func._doc.options_class
|
||||
if not class_name:
|
||||
return None
|
||||
try:
|
||||
return globals()[class_name]
|
||||
except KeyError:
|
||||
warnings.warn("Python binding for {} not exposed"
|
||||
.format(class_name), RuntimeWarning)
|
||||
return None
|
||||
|
||||
|
||||
def _handle_options(name, options_class, options, args, kwargs):
|
||||
if args or kwargs:
|
||||
if options is not None:
|
||||
raise TypeError(
|
||||
"Function {!r} called with both an 'options' argument "
|
||||
"and additional arguments"
|
||||
.format(name))
|
||||
return options_class(*args, **kwargs)
|
||||
|
||||
if options is not None:
|
||||
if isinstance(options, dict):
|
||||
return options_class(**options)
|
||||
elif isinstance(options, options_class):
|
||||
return options
|
||||
raise TypeError(
|
||||
"Function {!r} expected a {} parameter, got {}"
|
||||
.format(name, options_class, type(options)))
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _make_generic_wrapper(func_name, func, options_class, arity):
|
||||
if options_class is None:
|
||||
def wrapper(*args, memory_pool=None):
|
||||
if arity is not Ellipsis and len(args) != arity:
|
||||
raise TypeError(
|
||||
f"{func_name} takes {arity} positional argument(s), "
|
||||
f"but {len(args)} were given"
|
||||
)
|
||||
if args and isinstance(args[0], Expression):
|
||||
return Expression._call(func_name, list(args))
|
||||
return func.call(args, None, memory_pool)
|
||||
else:
|
||||
def wrapper(*args, memory_pool=None, options=None, **kwargs):
|
||||
if arity is not Ellipsis:
|
||||
if len(args) < arity:
|
||||
raise TypeError(
|
||||
f"{func_name} takes {arity} positional argument(s), "
|
||||
f"but {len(args)} were given"
|
||||
)
|
||||
option_args = args[arity:]
|
||||
args = args[:arity]
|
||||
else:
|
||||
option_args = ()
|
||||
options = _handle_options(func_name, options_class, options,
|
||||
option_args, kwargs)
|
||||
if args and isinstance(args[0], Expression):
|
||||
return Expression._call(func_name, list(args), options)
|
||||
return func.call(args, options, memory_pool)
|
||||
return wrapper
|
||||
|
||||
|
||||
def _make_signature(arg_names, var_arg_names, options_class):
|
||||
from inspect import Parameter
|
||||
params = []
|
||||
for name in arg_names:
|
||||
params.append(Parameter(name, Parameter.POSITIONAL_ONLY))
|
||||
for name in var_arg_names:
|
||||
params.append(Parameter(name, Parameter.VAR_POSITIONAL))
|
||||
if options_class is not None:
|
||||
options_sig = inspect.signature(options_class)
|
||||
for p in options_sig.parameters.values():
|
||||
assert p.kind in (Parameter.POSITIONAL_OR_KEYWORD,
|
||||
Parameter.KEYWORD_ONLY)
|
||||
if var_arg_names:
|
||||
# Cannot have a positional argument after a *args
|
||||
p = p.replace(kind=Parameter.KEYWORD_ONLY)
|
||||
params.append(p)
|
||||
params.append(Parameter("options", Parameter.KEYWORD_ONLY,
|
||||
default=None))
|
||||
params.append(Parameter("memory_pool", Parameter.KEYWORD_ONLY,
|
||||
default=None))
|
||||
return inspect.Signature(params)
|
||||
|
||||
|
||||
def _wrap_function(name, func):
|
||||
options_class = _get_options_class(func)
|
||||
arg_names = _get_arg_names(func)
|
||||
has_vararg = arg_names and arg_names[-1].startswith('*')
|
||||
if has_vararg:
|
||||
var_arg_names = [arg_names.pop().lstrip('*')]
|
||||
else:
|
||||
var_arg_names = []
|
||||
|
||||
wrapper = _make_generic_wrapper(
|
||||
name, func, options_class, arity=func.arity)
|
||||
wrapper.__signature__ = _make_signature(arg_names, var_arg_names,
|
||||
options_class)
|
||||
return _decorate_compute_function(wrapper, name, func, options_class)
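# Illustrative aside (not part of the original module): because each
# wrapper receives an explicit __signature__, the generated functions
# introspect like hand-written ones; the exact parameter list depends
# on the installed pyarrow version.
import inspect
import pyarrow.compute as pc
print(inspect.signature(pc.round))
# -> something like (x, /, *, ndigits=0, round_mode='half_to_even',
#    options=None, memory_pool=None)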
|
||||
|
||||
|
||||
def _make_global_functions():
|
||||
"""
|
||||
Make global functions wrapping each compute function.
|
||||
|
||||
Note that some of the automatically-generated wrappers may be overridden
|
||||
by custom versions below.
|
||||
"""
|
||||
g = globals()
|
||||
reg = function_registry()
|
||||
|
||||
# Avoid clashes with Python keywords
|
||||
rewrites = {'and': 'and_',
|
||||
'or': 'or_'}
|
||||
|
||||
for cpp_name in reg.list_functions():
|
||||
name = rewrites.get(cpp_name, cpp_name)
|
||||
func = reg.get_function(cpp_name)
|
||||
if func.kind == "hash_aggregate":
|
||||
# Hash aggregate functions are not callable,
|
||||
# so let's not expose them at module level.
|
||||
continue
|
||||
assert name not in g, name
|
||||
g[cpp_name] = g[name] = _wrap_function(name, func)
|
||||
|
||||
|
||||
_make_global_functions()
|
||||
|
||||
|
||||
def cast(arr, target_type=None, safe=None, options=None):
|
||||
"""
|
||||
Cast array values to another data type. Can also be invoked as an array
|
||||
instance method.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
arr : Array-like
|
||||
target_type : DataType or str
|
||||
Type to cast to
|
||||
safe : bool, default True
|
||||
Check for overflows or other unsafe conversions
|
||||
options : CastOptions, default None
|
||||
Additional checks passed via CastOptions
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from datetime import datetime
|
||||
>>> import pyarrow as pa
|
||||
>>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)])
|
||||
>>> arr.type
|
||||
TimestampType(timestamp[us])
|
||||
|
||||
You can use ``pyarrow.DataType`` objects to specify the target type:
|
||||
|
||||
>>> cast(arr, pa.timestamp('ms'))
|
||||
<pyarrow.lib.TimestampArray object at ...>
|
||||
[
|
||||
2010-01-01 00:00:00.000,
|
||||
2015-01-01 00:00:00.000
|
||||
]
|
||||
|
||||
>>> cast(arr, pa.timestamp('ms')).type
|
||||
TimestampType(timestamp[ms])
|
||||
|
||||
Alternatively, you can use string aliases for these types:
|
||||
|
||||
>>> arr.cast('timestamp[ms]')
|
||||
<pyarrow.lib.TimestampArray object at ...>
|
||||
[
|
||||
2010-01-01 00:00:00.000,
|
||||
2015-01-01 00:00:00.000
|
||||
]
|
||||
>>> arr.cast('timestamp[ms]').type
|
||||
TimestampType(timestamp[ms])
|
||||
|
||||
Returns
|
||||
-------
|
||||
casted : Array
|
||||
The cast result as a new Array
|
||||
"""
|
||||
safe_vars_passed = (safe is not None) or (target_type is not None)
|
||||
|
||||
if safe_vars_passed and (options is not None):
|
||||
raise ValueError("Must either pass values for 'target_type' and 'safe'"
|
||||
" or pass a value for 'options'")
|
||||
|
||||
if options is None:
|
||||
target_type = pa.types.lib.ensure_type(target_type)
|
||||
if safe is False:
|
||||
options = CastOptions.unsafe(target_type)
|
||||
else:
|
||||
options = CastOptions.safe(target_type)
|
||||
return call_function("cast", [arr], options)
|
||||
|
||||
|
||||
def index(data, value, start=None, end=None, *, memory_pool=None):
|
||||
"""
|
||||
Find the index of the first occurrence of a given value.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Array-like
|
||||
value : Scalar-like object
|
||||
The value to search for.
|
||||
start : int, optional
|
||||
end : int, optional
|
||||
memory_pool : MemoryPool, optional
|
||||
If not passed, will allocate memory from the default memory pool.
|
||||
|
||||
Returns
|
||||
-------
|
||||
index : int
|
||||
the index, or -1 if not found
|
||||
"""
|
||||
if start is not None:
|
||||
if end is not None:
|
||||
data = data.slice(start, end - start)
|
||||
else:
|
||||
data = data.slice(start)
|
||||
elif end is not None:
|
||||
data = data.slice(0, end)
|
||||
|
||||
if not isinstance(value, pa.Scalar):
|
||||
value = pa.scalar(value, type=data.type)
|
||||
elif data.type != value.type:
|
||||
value = pa.scalar(value.as_py(), type=data.type)
|
||||
options = IndexOptions(value=value)
|
||||
result = call_function('index', [data], options, memory_pool)
|
||||
if start is not None and result.as_py() >= 0:
|
||||
result = pa.scalar(result.as_py() + start, type=pa.int64())
|
||||
return result
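# Illustrative usage sketch (not part of the original module):
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["a", "b", "c", "b"])
pc.index(arr, "b")           # -> <pyarrow.Int64Scalar: 1>
# With start, the data is sliced and the result offset back, as above:
pc.index(arr, "b", start=2)  # -> <pyarrow.Int64Scalar: 3>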
|
||||
|
||||
|
||||
def take(data, indices, *, boundscheck=True, memory_pool=None):
|
||||
"""
|
||||
Select values (or records) from array- or table-like data given integer
|
||||
selection indices.
|
||||
|
||||
The result will be of the same type(s) as the input, with elements taken
|
||||
from the input array (or record batch / table fields) at the given
|
||||
indices. If an index is null then the corresponding value in the output
|
||||
will be null.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : Array, ChunkedArray, RecordBatch, or Table
|
||||
indices : Array, ChunkedArray
|
||||
Must be of integer type
|
||||
boundscheck : boolean, default True
|
||||
Whether to bounds-check the indices. If False, an out-of-bounds
index will likely crash the process.
|
||||
memory_pool : MemoryPool, optional
|
||||
If not passed, will allocate memory from the default memory pool.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : depends on inputs
|
||||
Selected values for the given indices
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> arr = pa.array(["a", "b", "c", None, "e", "f"])
|
||||
>>> indices = pa.array([0, None, 4, 3])
|
||||
>>> arr.take(indices)
|
||||
<pyarrow.lib.StringArray object at ...>
|
||||
[
|
||||
"a",
|
||||
null,
|
||||
"e",
|
||||
null
|
||||
]
|
||||
"""
|
||||
options = TakeOptions(boundscheck=boundscheck)
|
||||
return call_function('take', [data, indices], options, memory_pool)
|
||||
|
||||
|
||||
def fill_null(values, fill_value):
|
||||
"""
|
||||
Replace each null element in values with fill_value. The fill_value must be
the same type as values, or able to be implicitly cast to the array's type.
|
||||
|
||||
This is an alias for :func:`coalesce`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : Array, ChunkedArray, or Scalar-like object
|
||||
Each null element is replaced with the corresponding value
|
||||
from fill_value.
|
||||
fill_value : Array, ChunkedArray, or Scalar-like object
|
||||
If not same type as data will attempt to cast.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : depends on inputs
|
||||
Values with all null elements replaced
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> arr = pa.array([1, 2, None, 3], type=pa.int8())
|
||||
>>> fill_value = pa.scalar(5, type=pa.int8())
|
||||
>>> arr.fill_null(fill_value)
|
||||
<pyarrow.lib.Int8Array object at ...>
|
||||
[
|
||||
1,
|
||||
2,
|
||||
5,
|
||||
3
|
||||
]
|
||||
"""
|
||||
if not isinstance(fill_value, (pa.Array, pa.ChunkedArray, pa.Scalar)):
|
||||
fill_value = pa.scalar(fill_value, type=values.type)
|
||||
elif values.type != fill_value.type:
|
||||
fill_value = pa.scalar(fill_value.as_py(), type=values.type)
|
||||
|
||||
return call_function("coalesce", [values, fill_value])
|
||||
|
||||
|
||||
def top_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
|
||||
"""
|
||||
Select the indices of the top-k ordered elements from array- or table-like
|
||||
data.
|
||||
|
||||
This is a specialization for :func:`select_k_unstable`. Output is not
|
||||
guaranteed to be stable.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : Array, ChunkedArray, RecordBatch, or Table
|
||||
Data to sort and get top indices from.
|
||||
k : int
|
||||
The number of elements to keep (the `k` in top-k).
|
||||
sort_keys : List-like
|
||||
Column key names to order by when input is table-like data.
|
||||
memory_pool : MemoryPool, optional
|
||||
If not passed, will allocate memory from the default memory pool.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : Array
|
||||
Indices of the top-k ordered elements
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> import pyarrow.compute as pc
|
||||
>>> arr = pa.array(["a", "b", "c", None, "e", "f"])
|
||||
>>> pc.top_k_unstable(arr, k=3)
|
||||
<pyarrow.lib.UInt64Array object at ...>
|
||||
[
|
||||
5,
|
||||
4,
|
||||
2
|
||||
]
|
||||
"""
|
||||
if sort_keys is None:
|
||||
sort_keys = []
|
||||
if isinstance(values, (pa.Array, pa.ChunkedArray)):
|
||||
sort_keys.append(("dummy", "descending"))
|
||||
else:
|
||||
sort_keys = map(lambda key_name: (key_name, "descending"), sort_keys)
|
||||
options = SelectKOptions(k, sort_keys)
|
||||
return call_function("select_k_unstable", [values], options, memory_pool)
|
||||
|
||||
|
||||
def bottom_k_unstable(values, k, sort_keys=None, *, memory_pool=None):
|
||||
"""
|
||||
Select the indices of the bottom-k ordered elements from
|
||||
array- or table-like data.
|
||||
|
||||
This is a specialization for :func:`select_k_unstable`. Output is not
|
||||
guaranteed to be stable.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
values : Array, ChunkedArray, RecordBatch, or Table
|
||||
Data to sort and get bottom indices from.
|
||||
k : int
|
||||
The number of elements to keep (the `k` in bottom-k).
|
||||
sort_keys : List-like
|
||||
Column key names to order by when input is table-like data.
|
||||
memory_pool : MemoryPool, optional
|
||||
If not passed, will allocate memory from the default memory pool.
|
||||
|
||||
Returns
|
||||
-------
|
||||
result : Array of indices
|
||||
Indices of the bottom-k ordered elements
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow as pa
|
||||
>>> import pyarrow.compute as pc
|
||||
>>> arr = pa.array(["a", "b", "c", None, "e", "f"])
|
||||
>>> pc.bottom_k_unstable(arr, k=3)
|
||||
<pyarrow.lib.UInt64Array object at ...>
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2
|
||||
]
|
||||
"""
|
||||
if sort_keys is None:
|
||||
sort_keys = []
|
||||
if isinstance(values, (pa.Array, pa.ChunkedArray)):
|
||||
sort_keys.append(("dummy", "ascending"))
|
||||
else:
|
||||
sort_keys = map(lambda key_name: (key_name, "ascending"), sort_keys)
|
||||
options = SelectKOptions(k, sort_keys)
|
||||
return call_function("select_k_unstable", [values], options, memory_pool)
|
||||
|
||||
|
||||
def random(n, *, initializer='system', options=None, memory_pool=None):
|
||||
"""
|
||||
Generate numbers in the range [0, 1).
|
||||
|
||||
Generated values are uniformly-distributed, double-precision
|
||||
in range [0, 1). Algorithm and seed can be changed via RandomOptions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : int
|
||||
Number of values to generate, must be greater than or equal to 0
|
||||
initializer : int or str
|
||||
How to initialize the underlying random generator.
|
||||
If an integer is given, it is used as a seed.
|
||||
If "system" is given, the random generator is initialized with
|
||||
a system-specific source of (hopefully true) randomness.
|
||||
Other values are invalid.
|
||||
options : pyarrow.compute.RandomOptions, optional
|
||||
Alternative way of passing options.
|
||||
memory_pool : pyarrow.MemoryPool, optional
|
||||
If not passed, will allocate memory from the default memory pool.
|
||||
"""
|
||||
if options is None:
    options = RandomOptions(initializer=initializer)
|
||||
return call_function("random", [], options, memory_pool, length=n)
|
||||
|
||||
|
||||
def field(*name_or_index):
|
||||
"""Reference a column of the dataset.
|
||||
|
||||
Stores only the field's name. Type and other information is known only when
the expression is bound to a dataset with an explicit schema.
|
||||
|
||||
Nested references are allowed by passing multiple names or a tuple of
|
||||
names. For example ``('foo', 'bar')`` references the field named "bar"
|
||||
inside the field named "foo".
|
||||
|
||||
Parameters
|
||||
----------
|
||||
*name_or_index : string, multiple strings, tuple or int
|
||||
The name or index of the (possibly nested) field the expression
refers to.
|
||||
|
||||
Returns
|
||||
-------
|
||||
field_expr : Expression
|
||||
Reference to the given field
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import pyarrow.compute as pc
|
||||
>>> pc.field("a")
|
||||
<pyarrow.compute.Expression a>
|
||||
>>> pc.field(1)
|
||||
<pyarrow.compute.Expression FieldPath(1)>
|
||||
>>> pc.field(("a", "b"))
|
||||
<pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
|
||||
>>> pc.field("a", "b")
|
||||
<pyarrow.compute.Expression FieldRef.Nested(FieldRef.Name(a) ...
|
||||
"""
|
||||
n = len(name_or_index)
|
||||
if n == 1:
|
||||
if isinstance(name_or_index[0], (str, int)):
|
||||
return Expression._field(name_or_index[0])
|
||||
elif isinstance(name_or_index[0], tuple):
|
||||
return Expression._nested_field(name_or_index[0])
|
||||
else:
|
||||
raise TypeError(
|
||||
"field reference should be str, multiple str, tuple or "
|
||||
f"integer, got {type(name_or_index[0])}"
|
||||
)
|
||||
# In case of multiple strings not supplied in a tuple
|
||||
else:
|
||||
return Expression._nested_field(name_or_index)
|
||||
|
||||
|
||||
def scalar(value):
|
||||
"""Expression representing a scalar value.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
value : bool, int, float or string
|
||||
Python value of the scalar. Note that only a subset of types are
|
||||
currently supported.
|
||||
|
||||
Returns
|
||||
-------
|
||||
scalar_expr : Expression
|
||||
An Expression representing the scalar value
|
||||
"""
|
||||
return Expression._scalar(value)
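# Illustrative usage sketch (not part of the original module): field()
# and scalar() compose into expressions via Python operators; nothing
# is evaluated until the expression is bound to a dataset, e.g. as a
# scan filter via dataset.to_table(filter=expr).
import pyarrow.compute as pc

expr = pc.field("price") > pc.scalar(100)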
|
||||
76
venv/lib/python3.9/site-packages/pyarrow/config.pxi
Normal file
@@ -0,0 +1,76 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from pyarrow.includes.libarrow cimport GetBuildInfo
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
VersionInfo = namedtuple('VersionInfo', ('major', 'minor', 'patch'))
|
||||
|
||||
BuildInfo = namedtuple(
|
||||
'BuildInfo',
|
||||
('version', 'version_info', 'so_version', 'full_so_version',
|
||||
'compiler_id', 'compiler_version', 'compiler_flags',
|
||||
'git_id', 'git_description', 'package_kind', 'build_type'))
|
||||
|
||||
RuntimeInfo = namedtuple('RuntimeInfo',
|
||||
('simd_level', 'detected_simd_level'))
|
||||
|
||||
cdef _build_info():
|
||||
cdef:
|
||||
const CBuildInfo* c_info
|
||||
|
||||
c_info = &GetBuildInfo()
|
||||
|
||||
return BuildInfo(version=frombytes(c_info.version_string),
|
||||
version_info=VersionInfo(c_info.version_major,
|
||||
c_info.version_minor,
|
||||
c_info.version_patch),
|
||||
so_version=frombytes(c_info.so_version),
|
||||
full_so_version=frombytes(c_info.full_so_version),
|
||||
compiler_id=frombytes(c_info.compiler_id),
|
||||
compiler_version=frombytes(c_info.compiler_version),
|
||||
compiler_flags=frombytes(c_info.compiler_flags),
|
||||
git_id=frombytes(c_info.git_id),
|
||||
git_description=frombytes(c_info.git_description),
|
||||
package_kind=frombytes(c_info.package_kind),
|
||||
build_type=frombytes(c_info.build_type).lower(),
|
||||
)
|
||||
|
||||
|
||||
cpp_build_info = _build_info()
|
||||
cpp_version = cpp_build_info.version
|
||||
cpp_version_info = cpp_build_info.version_info
|
||||
|
||||
|
||||
def runtime_info():
|
||||
"""
|
||||
Get runtime information.
|
||||
|
||||
Returns
|
||||
-------
|
||||
info : pyarrow.RuntimeInfo
|
||||
"""
|
||||
cdef:
|
||||
CRuntimeInfo c_info
|
||||
|
||||
c_info = GetRuntimeInfo()
|
||||
|
||||
return RuntimeInfo(
|
||||
simd_level=frombytes(c_info.simd_level),
|
||||
detected_simd_level=frombytes(c_info.detected_simd_level))
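# Illustrative usage sketch (not part of the original module): build
# metadata is fixed at compile time, while runtime_info() reflects the
# SIMD level actually selected on the running machine.
import pyarrow as pa

print(pa.cpp_version)                 # e.g. "9.0.0"
print(pa.runtime_info().simd_level)   # e.g. "avx2"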
|
||||
267
venv/lib/python3.9/site-packages/pyarrow/conftest.py
Normal file
@@ -0,0 +1,267 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import pytest
|
||||
from pyarrow import Codec
|
||||
from pyarrow import fs
|
||||
|
||||
groups = [
|
||||
'brotli',
|
||||
'bz2',
|
||||
'cython',
|
||||
'dataset',
|
||||
'hypothesis',
|
||||
'fastparquet',
|
||||
'gandiva',
|
||||
'gcs',
|
||||
'gdb',
|
||||
'gzip',
|
||||
'hdfs',
|
||||
'large_memory',
|
||||
'lz4',
|
||||
'memory_leak',
|
||||
'nopandas',
|
||||
'orc',
|
||||
'pandas',
|
||||
'parquet',
|
||||
'parquet_encryption',
|
||||
'plasma',
|
||||
's3',
|
||||
'snappy',
|
||||
'substrait',
|
||||
'tensorflow',
|
||||
'flight',
|
||||
'slow',
|
||||
'requires_testing_data',
|
||||
'zstd',
|
||||
]
|
||||
|
||||
defaults = {
|
||||
'brotli': Codec.is_available('brotli'),
|
||||
'bz2': Codec.is_available('bz2'),
|
||||
'cython': False,
|
||||
'dataset': False,
|
||||
'fastparquet': False,
|
||||
'flight': False,
|
||||
'gandiva': False,
|
||||
'gcs': False,
|
||||
'gdb': True,
|
||||
'gzip': Codec.is_available('gzip'),
|
||||
'hdfs': False,
|
||||
'hypothesis': False,
|
||||
'large_memory': False,
|
||||
'lz4': Codec.is_available('lz4'),
|
||||
'memory_leak': False,
|
||||
'nopandas': False,
|
||||
'orc': False,
|
||||
'pandas': False,
|
||||
'parquet': False,
|
||||
'parquet_encryption': False,
|
||||
'plasma': False,
|
||||
'requires_testing_data': True,
|
||||
's3': False,
|
||||
'slow': False,
|
||||
'snappy': Codec.is_available('snappy'),
|
||||
'substrait': False,
|
||||
'tensorflow': False,
|
||||
'zstd': Codec.is_available('zstd'),
|
||||
}
|
||||
|
||||
try:
|
||||
import cython # noqa
|
||||
defaults['cython'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import fastparquet # noqa
|
||||
defaults['fastparquet'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import pyarrow.gandiva # noqa
|
||||
defaults['gandiva'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import pyarrow.dataset # noqa
|
||||
defaults['dataset'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import pyarrow.orc # noqa
|
||||
defaults['orc'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import pandas # noqa
|
||||
defaults['pandas'] = True
|
||||
except ImportError:
|
||||
defaults['nopandas'] = True
|
||||
|
||||
try:
|
||||
import pyarrow.parquet # noqa
|
||||
defaults['parquet'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import pyarrow.parquet.encryption # noqa
|
||||
defaults['parquet_encryption'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
try:
|
||||
import pyarrow.plasma # noqa
|
||||
defaults['plasma'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import tensorflow # noqa
|
||||
defaults['tensorflow'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import pyarrow.flight # noqa
|
||||
defaults['flight'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from pyarrow.fs import GcsFileSystem # noqa
|
||||
defaults['gcs'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
try:
|
||||
from pyarrow.fs import S3FileSystem # noqa
|
||||
defaults['s3'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from pyarrow.fs import HadoopFileSystem # noqa
|
||||
defaults['hdfs'] = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
import pyarrow.substrait # noqa
|
||||
defaults['substrait'] = True
|
||||
except ImportError:
|
||||
pass
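# Illustrative aside (not part of the original module): the try/except
# cascade above is the usual optional-dependency probe. A generic
# version, with hypothetical names, might look like:
import importlib

def _probe(module_name):
    # Flip a capability flag if the optional module imports cleanly.
    try:
        importlib.import_module(module_name)
        return True
    except ImportError:
        return False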
|
||||
|
||||
|
||||
# Doctest should ignore files for the modules that are not built
|
||||
def pytest_ignore_collect(path, config):
|
||||
if config.option.doctestmodules:
|
||||
# don't try to run doctests on the /tests directory
|
||||
if "/pyarrow/tests/" in str(path):
|
||||
return True
|
||||
|
||||
doctest_groups = [
|
||||
'dataset',
|
||||
'orc',
|
||||
'parquet',
|
||||
'plasma',
|
||||
'flight',
|
||||
'substrait',
|
||||
]
|
||||
|
||||
# handle cuda, flight, etc
|
||||
for group in doctest_groups:
|
||||
if 'pyarrow/{}'.format(group) in str(path):
|
||||
if not defaults[group]:
|
||||
return True
|
||||
|
||||
if 'pyarrow/parquet/encryption' in str(path):
|
||||
if not defaults['parquet_encryption']:
|
||||
return True
|
||||
|
||||
if 'pyarrow/cuda' in str(path):
|
||||
try:
|
||||
import pyarrow.cuda # noqa
|
||||
return False
|
||||
except ImportError:
|
||||
return True
|
||||
|
||||
if 'pyarrow/fs' in str(path):
|
||||
try:
|
||||
from pyarrow.fs import S3FileSystem # noqa
|
||||
return False
|
||||
except ImportError:
|
||||
return True
|
||||
|
||||
if getattr(config.option, "doctest_cython", False):
|
||||
if "/pyarrow/tests/" in str(path):
|
||||
return True
|
||||
if "/pyarrow/_parquet_encryption" in str(path):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
# Save output files from doctest examples into temp dir
|
||||
@pytest.fixture(autouse=True)
|
||||
def _docdir(request):
|
||||
|
||||
# Trigger ONLY for the doctests
|
||||
doctest_m = request.config.option.doctestmodules
|
||||
doctest_c = getattr(request.config.option, "doctest_cython", False)
|
||||
|
||||
if doctest_m or doctest_c:
|
||||
|
||||
# Get the fixture dynamically by its name.
|
||||
tmpdir = request.getfixturevalue('tmpdir')
|
||||
|
||||
# Chdir only for the duration of the test.
|
||||
with tmpdir.as_cwd():
|
||||
yield
|
||||
|
||||
else:
|
||||
yield
|
||||
|
||||
|
||||
# Define doctest_namespace for fs module docstring import
|
||||
@pytest.fixture(autouse=True)
|
||||
def add_fs(doctest_namespace, request, tmp_path):
|
||||
|
||||
# Trigger ONLY for the doctests
|
||||
doctest_m = request.config.option.doctestmodules
|
||||
doctest_c = getattr(request.config.option, "doctest_cython", False)
|
||||
|
||||
if doctest_m or doctest_c:
|
||||
# fs import
|
||||
doctest_namespace["fs"] = fs
|
||||
|
||||
# Creation of an object and file with data
|
||||
local = fs.LocalFileSystem()
|
||||
path = tmp_path / 'pyarrow-fs-example.dat'
|
||||
with local.open_output_stream(str(path)) as stream:
|
||||
stream.write(b'data')
|
||||
doctest_namespace["local"] = local
|
||||
doctest_namespace["local_path"] = str(tmp_path)
|
||||
doctest_namespace["path"] = str(path)
|
||||
yield
|
||||
22
venv/lib/python3.9/site-packages/pyarrow/csv.py
Normal file
@@ -0,0 +1,22 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
|
||||
from pyarrow._csv import ( # noqa
|
||||
ReadOptions, ParseOptions, ConvertOptions, ISO8601,
|
||||
open_csv, read_csv, CSVStreamingReader, write_csv,
|
||||
WriteOptions, CSVWriter, InvalidRow)
|
||||
25
venv/lib/python3.9/site-packages/pyarrow/cuda.py
Normal file
@@ -0,0 +1,25 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
# flake8: noqa
|
||||
|
||||
|
||||
from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer,
|
||||
HostBuffer, BufferReader, BufferWriter,
|
||||
new_host_buffer,
|
||||
serialize_record_batch, read_message,
|
||||
read_record_batch)
|
||||
1003
venv/lib/python3.9/site-packages/pyarrow/dataset.py
Normal file
File diff suppressed because it is too large
258
venv/lib/python3.9/site-packages/pyarrow/error.pxi
Normal file
@@ -0,0 +1,258 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetInterrupt
|
||||
|
||||
from pyarrow.includes.libarrow cimport CStatus
|
||||
from pyarrow.includes.libarrow_python cimport IsPyError, RestorePyError
|
||||
from pyarrow.includes.common cimport c_string
|
||||
|
||||
from contextlib import contextmanager
|
||||
import os
|
||||
import signal
|
||||
import threading
|
||||
|
||||
from pyarrow.util import _break_traceback_cycle_from_frame
|
||||
|
||||
|
||||
class ArrowException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class ArrowInvalid(ValueError, ArrowException):
|
||||
pass
|
||||
|
||||
|
||||
class ArrowMemoryError(MemoryError, ArrowException):
|
||||
pass
|
||||
|
||||
|
||||
class ArrowKeyError(KeyError, ArrowException):
|
||||
def __str__(self):
|
||||
# Override KeyError.__str__, as it uses the repr() of the key
|
||||
return ArrowException.__str__(self)
|
||||
|
||||
|
||||
class ArrowTypeError(TypeError, ArrowException):
|
||||
pass
|
||||
|
||||
|
||||
class ArrowNotImplementedError(NotImplementedError, ArrowException):
|
||||
pass
|
||||
|
||||
|
||||
class ArrowCapacityError(ArrowException):
|
||||
pass
|
||||
|
||||
|
||||
class ArrowIndexError(IndexError, ArrowException):
|
||||
pass
|
||||
|
||||
|
||||
class ArrowSerializationError(ArrowException):
|
||||
pass
|
||||
|
||||
|
||||
class ArrowCancelled(ArrowException):
|
||||
def __init__(self, message, signum=None):
|
||||
super().__init__(message)
|
||||
self.signum = signum
|
||||
|
||||
|
||||
# Compatibility alias
|
||||
ArrowIOError = IOError
|
||||
|
||||
|
||||
# This function could be written directly in C++ if we didn't
|
||||
# define Arrow-specific subclasses (ArrowInvalid etc.)
|
||||
cdef int check_status(const CStatus& status) nogil except -1:
|
||||
if status.ok():
|
||||
return 0
|
||||
|
||||
with gil:
|
||||
if IsPyError(status):
|
||||
RestorePyError(status)
|
||||
return -1
|
||||
|
||||
# We don't use Status::ToString() as it would redundantly include
|
||||
# the C++ class name.
|
||||
message = frombytes(status.message(), safe=True)
|
||||
detail = status.detail()
|
||||
if detail != nullptr:
|
||||
message += ". Detail: " + frombytes(detail.get().ToString(),
|
||||
safe=True)
|
||||
|
||||
if status.IsInvalid():
|
||||
raise ArrowInvalid(message)
|
||||
elif status.IsIOError():
|
||||
# Note: OSError constructor is
|
||||
# OSError(message)
|
||||
# or
|
||||
# OSError(errno, message, filename=None)
|
||||
# or (on Windows)
|
||||
# OSError(errno, message, filename, winerror)
|
||||
errno = ErrnoFromStatus(status)
|
||||
winerror = WinErrorFromStatus(status)
|
||||
if winerror != 0:
|
||||
raise IOError(errno, message, None, winerror)
|
||||
elif errno != 0:
|
||||
raise IOError(errno, message)
|
||||
else:
|
||||
raise IOError(message)
|
||||
elif status.IsOutOfMemory():
|
||||
raise ArrowMemoryError(message)
|
||||
elif status.IsKeyError():
|
||||
raise ArrowKeyError(message)
|
||||
elif status.IsNotImplemented():
|
||||
raise ArrowNotImplementedError(message)
|
||||
elif status.IsTypeError():
|
||||
raise ArrowTypeError(message)
|
||||
elif status.IsCapacityError():
|
||||
raise ArrowCapacityError(message)
|
||||
elif status.IsIndexError():
|
||||
raise ArrowIndexError(message)
|
||||
elif status.IsSerializationError():
|
||||
raise ArrowSerializationError(message)
|
||||
elif status.IsCancelled():
|
||||
signum = SignalFromStatus(status)
|
||||
if signum > 0:
|
||||
raise ArrowCancelled(message, signum)
|
||||
else:
|
||||
raise ArrowCancelled(message)
|
||||
else:
|
||||
message = frombytes(status.ToString(), safe=True)
|
||||
raise ArrowException(message)
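# Illustrative aside (not part of the original module): because the
# Arrow exception classes above multiply inherit from builtins, callers
# can catch either flavor.
import pyarrow as pa

try:
    pa.array([1, "a"])   # mixed types cannot be converted
except ValueError as e:
    # ArrowInvalid subclasses ValueError, so both handlers match.
    assert isinstance(e, pa.ArrowInvalid)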
|
||||
|
||||
|
||||
# This is an API function for C++ PyArrow
|
||||
cdef api int pyarrow_internal_check_status(const CStatus& status) \
|
||||
nogil except -1:
|
||||
return check_status(status)
|
||||
|
||||
|
||||
cdef class StopToken:
|
||||
cdef void init(self, CStopToken stop_token):
|
||||
self.stop_token = move(stop_token)
|
||||
|
||||
|
||||
cdef c_bool signal_handlers_enabled = True
|
||||
|
||||
|
||||
def enable_signal_handlers(c_bool enable):
|
||||
"""
|
||||
Enable or disable interruption of long-running operations.
|
||||
|
||||
By default, certain long running operations will detect user
|
||||
interruptions, such as by pressing Ctrl-C. This detection relies
|
||||
on setting a signal handler for the duration of the long-running
|
||||
operation, and may therefore interfere with other frameworks or
|
||||
libraries (such as an event loop).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
enable : bool
|
||||
Whether to enable user interruption by setting a temporary
|
||||
signal handler.
|
||||
"""
|
||||
global signal_handlers_enabled
|
||||
signal_handlers_enabled = enable
|
||||
|
||||
|
||||
# For internal use
|
||||
|
||||
# Whether we need a workaround for https://bugs.python.org/issue42248
|
||||
have_signal_refcycle = (sys.version_info < (3, 8, 10) or
|
||||
(3, 9) <= sys.version_info < (3, 9, 5) or
|
||||
sys.version_info[:2] == (3, 10))
|
||||
|
||||
cdef class SignalStopHandler:
|
||||
cdef:
|
||||
StopToken _stop_token
|
||||
vector[int] _signals
|
||||
c_bool _enabled
|
||||
|
||||
def __cinit__(self):
|
||||
self._enabled = False
|
||||
|
||||
self._init_signals()
|
||||
if have_signal_refcycle:
|
||||
_break_traceback_cycle_from_frame(sys._getframe(0))
|
||||
|
||||
self._stop_token = StopToken()
|
||||
|
||||
if not self._signals.empty():
|
||||
maybe_source = SetSignalStopSource()
|
||||
if not maybe_source.ok():
|
||||
# See ARROW-11841 / ARROW-17173: in complex interaction
|
||||
# scenarios (such as R calling into Python), SetSignalStopSource()
|
||||
# may have already activated a signal-receiving StopSource.
|
||||
# Just warn instead of erroring out.
|
||||
maybe_source.status().Warn()
|
||||
else:
|
||||
self._stop_token.init(deref(maybe_source).token())
|
||||
self._enabled = True
|
||||
|
||||
def _init_signals(self):
|
||||
if (signal_handlers_enabled and
|
||||
threading.current_thread() is threading.main_thread()):
|
||||
self._signals = [
|
||||
sig for sig in (signal.SIGINT, signal.SIGTERM)
|
||||
if signal.getsignal(sig) not in (signal.SIG_DFL,
|
||||
signal.SIG_IGN, None)]
|
||||
|
||||
def __enter__(self):
|
||||
if self._enabled:
|
||||
check_status(RegisterCancellingSignalHandler(self._signals))
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_value, exc_tb):
|
||||
if self._enabled:
|
||||
UnregisterCancellingSignalHandler()
|
||||
if exc_value is None:
|
||||
# Make sure we didn't lose a signal
|
||||
try:
|
||||
check_status(self._stop_token.stop_token.Poll())
|
||||
except ArrowCancelled as e:
|
||||
exc_value = e
|
||||
if isinstance(exc_value, ArrowCancelled):
|
||||
if exc_value.signum:
|
||||
# Re-emit the exact same signal. We restored the Python signal
|
||||
# handler above, so it should receive it.
|
||||
if os.name == 'nt':
|
||||
SendSignal(exc_value.signum)
|
||||
else:
|
||||
SendSignalToThread(exc_value.signum,
|
||||
threading.main_thread().ident)
|
||||
else:
|
||||
# Simulate Python receiving a SIGINT
|
||||
# (see https://bugs.python.org/issue43356 for why we can't
|
||||
# simulate the exact signal number)
|
||||
PyErr_SetInterrupt()
|
||||
# Maximize chances of the Python signal handler being executed now.
|
||||
# Otherwise a potential KeyboardInterrupt might be missed by an
|
||||
# immediately enclosing try/except block.
|
||||
PyErr_CheckSignals()
|
||||
# ArrowCancelled will be re-raised if PyErr_CheckSignals()
|
||||
# returned successfully.
|
||||
|
||||
def __dealloc__(self):
|
||||
if self._enabled:
|
||||
ResetSignalStopSource()
|
||||
|
||||
@property
|
||||
def stop_token(self):
|
||||
return self._stop_token
|
||||
277
venv/lib/python3.9/site-packages/pyarrow/feather.py
Normal file
@@ -0,0 +1,277 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
|
||||
import os
|
||||
|
||||
from pyarrow.pandas_compat import _pandas_api # noqa
|
||||
from pyarrow.lib import (Codec, Table, # noqa
|
||||
concat_tables, schema)
|
||||
import pyarrow.lib as ext
|
||||
from pyarrow import _feather
|
||||
from pyarrow._feather import FeatherError # noqa: F401
|
||||
|
||||
|
||||
class FeatherDataset:
|
||||
"""
|
||||
Encapsulates details of reading a list of Feather files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path_or_paths : List[str]
|
||||
A list of file names
|
||||
validate_schema : bool, default True
|
||||
Check that individual file schemas are all the same / compatible
|
||||
"""
|
||||
|
||||
def __init__(self, path_or_paths, validate_schema=True):
|
||||
self.paths = path_or_paths
|
||||
self.validate_schema = validate_schema
|
||||
|
||||
def read_table(self, columns=None):
|
||||
"""
|
||||
Read multiple Feather files as a single pyarrow.Table
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns : List[str]
|
||||
Names of columns to read from the file
|
||||
|
||||
Returns
|
||||
-------
|
||||
pyarrow.Table
|
||||
Content of the file as a table (of columns)
|
||||
"""
|
||||
_fil = read_table(self.paths[0], columns=columns)
|
||||
self._tables = [_fil]
|
||||
self.schema = _fil.schema
|
||||
|
||||
for path in self.paths[1:]:
|
||||
table = read_table(path, columns=columns)
|
||||
if self.validate_schema:
|
||||
self.validate_schemas(path, table)
|
||||
self._tables.append(table)
|
||||
return concat_tables(self._tables)
|
||||
|
||||
def validate_schemas(self, piece, table):
|
||||
if not self.schema.equals(table.schema):
|
||||
raise ValueError('Schema in {!s} was different. \n'
|
||||
'{!s}\n\nvs\n\n{!s}'
|
||||
.format(piece, self.schema,
|
||||
table.schema))
|
||||
|
||||
def read_pandas(self, columns=None, use_threads=True):
|
||||
"""
|
||||
Read multiple Feather files as a single pandas DataFrame
|
||||
|
||||
Parameters
|
||||
----------
|
||||
columns : List[str]
|
||||
Names of columns to read from the file
|
||||
use_threads : bool, default True
|
||||
Use multiple threads when converting to pandas
|
||||
|
||||
Returns
|
||||
-------
|
||||
pandas.DataFrame
|
||||
Content of the file as a pandas DataFrame (of columns)
|
||||
"""
|
||||
return self.read_table(columns=columns).to_pandas(
|
||||
use_threads=use_threads)
|
||||
|
||||
|
||||
def check_chunked_overflow(name, col):
|
||||
if col.num_chunks == 1:
|
||||
return
|
||||
|
||||
if col.type in (ext.binary(), ext.string()):
|
||||
raise ValueError("Column '{}' exceeds 2GB maximum capacity of "
|
||||
"a Feather binary column. This restriction may be "
|
||||
"lifted in the future".format(name))
|
||||
else:
|
||||
# TODO(wesm): Not sure when else this might be reached
|
||||
raise ValueError("Column '{}' of type {} was chunked on conversion "
|
||||
"to Arrow and cannot be currently written to "
|
||||
"Feather format".format(name, str(col.type)))
|
||||
|
||||
|
||||
_FEATHER_SUPPORTED_CODECS = {'lz4', 'zstd', 'uncompressed'}
|
||||
|
||||
|
||||
def write_feather(df, dest, compression=None, compression_level=None,
|
||||
chunksize=None, version=2):
|
||||
"""
|
||||
Write a pandas.DataFrame to Feather format.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : pandas.DataFrame or pyarrow.Table
|
||||
Data to write out as Feather format.
|
||||
dest : str
|
||||
Local destination path.
|
||||
compression : string, default None
|
||||
Can be one of {"zstd", "lz4", "uncompressed"}. The default of None uses
|
||||
LZ4 for V2 files if it is available, otherwise uncompressed.
|
||||
compression_level : int, default None
|
||||
Use a compression level particular to the chosen compressor. If None
|
||||
use the default compression level
|
||||
chunksize : int, default None
|
||||
For V2 files, the internal maximum size of Arrow RecordBatch chunks
|
||||
when writing the Arrow IPC file format. None means use the default,
|
||||
which is currently 64K
|
||||
version : int, default 2
|
||||
Feather file version. Version 2 is the current. Version 1 is the more
|
||||
limited legacy format
|
||||
"""
|
||||
if _pandas_api.have_pandas:
|
||||
if (_pandas_api.has_sparse and
|
||||
isinstance(df, _pandas_api.pd.SparseDataFrame)):
|
||||
df = df.to_dense()
|
||||
|
||||
if _pandas_api.is_data_frame(df):
|
||||
# Feather v1 creates a new column in the resultant Table to
|
||||
# store index information if index type is not RangeIndex
|
||||
|
||||
if version == 1:
|
||||
preserve_index = False
|
||||
elif version == 2:
|
||||
preserve_index = None
|
||||
else:
|
||||
raise ValueError("Version value should either be 1 or 2")
|
||||
|
||||
table = Table.from_pandas(df, preserve_index=preserve_index)
|
||||
|
||||
if version == 1:
|
||||
# Version 1 does not support chunking
|
||||
for i, name in enumerate(table.schema.names):
|
||||
col = table[i]
|
||||
check_chunked_overflow(name, col)
|
||||
else:
|
||||
table = df
|
||||
|
||||
if version == 1:
|
||||
if len(table.column_names) > len(set(table.column_names)):
|
||||
raise ValueError("cannot serialize duplicate column names")
|
||||
|
||||
if compression is not None:
|
||||
raise ValueError("Feather V1 files do not support compression "
|
||||
"option")
|
||||
|
||||
if chunksize is not None:
|
||||
raise ValueError("Feather V1 files do not support chunksize "
|
||||
"option")
|
||||
else:
|
||||
if compression is None and Codec.is_available('lz4_frame'):
|
||||
compression = 'lz4'
|
||||
elif (compression is not None and
|
||||
compression not in _FEATHER_SUPPORTED_CODECS):
|
||||
raise ValueError('compression="{}" not supported, must be '
|
||||
'one of {}'.format(compression,
|
||||
_FEATHER_SUPPORTED_CODECS))
|
||||
|
||||
try:
|
||||
_feather.write_feather(table, dest, compression=compression,
|
||||
compression_level=compression_level,
|
||||
chunksize=chunksize, version=version)
|
||||
except Exception:
|
||||
if isinstance(dest, str):
|
||||
try:
|
||||
os.remove(dest)
|
||||
except os.error:
|
||||
pass
|
||||
raise
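# Illustrative round-trip sketch (not part of the original module;
# assumes pandas is installed and the file path is just an example):
import pandas as pd
from pyarrow import feather

df = pd.DataFrame({"a": [1, 2, 3]})
feather.write_feather(df, "/tmp/example.feather")
assert feather.read_feather("/tmp/example.feather").equals(df)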
|
||||
|
||||
|
||||
def read_feather(source, columns=None, use_threads=True,
|
||||
memory_map=False, **kwargs):
|
||||
"""
|
||||
Read a pandas.DataFrame from Feather format. To read as pyarrow.Table use
|
||||
feather.read_table.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source : str file path, or file-like object
You can use a MemoryMappedFile as source to explicitly use memory mapping.
|
||||
columns : sequence, optional
|
||||
Only read a specific set of columns. If not provided, all columns are
|
||||
read.
|
||||
use_threads : bool, default True
|
||||
Whether to parallelize reading using multiple threads. If False,
reading is single-threaded both for the conversion to pandas and for
the reading from Feather format.
|
||||
memory_map : boolean, default False
|
||||
Use memory mapping when opening file on disk, when source is a str.
|
||||
**kwargs
|
||||
Additional keyword arguments passed on to `pyarrow.Table.to_pandas`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
df : pandas.DataFrame
|
||||
The contents of the Feather file as a pandas.DataFrame
|
||||
"""
|
||||
return (read_table(
|
||||
source, columns=columns, memory_map=memory_map,
|
||||
use_threads=use_threads).to_pandas(use_threads=use_threads, **kwargs))
|
||||
|
||||
|
||||
def read_table(source, columns=None, memory_map=False, use_threads=True):
|
||||
"""
|
||||
Read a pyarrow.Table from Feather format
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source : str file path, or file-like object
You can use a MemoryMappedFile as source to explicitly use memory mapping.
|
||||
columns : sequence, optional
|
||||
Only read a specific set of columns. If not provided, all columns are
|
||||
read.
|
||||
memory_map : boolean, default False
|
||||
Use memory mapping when opening file on disk, when source is a str
|
||||
use_threads : bool, default True
|
||||
Whether to parallelize reading using multiple threads.
|
||||
|
||||
Returns
|
||||
-------
|
||||
table : pyarrow.Table
|
||||
The contents of the Feather file as a pyarrow.Table
|
||||
"""
|
||||
reader = _feather.FeatherReader(
|
||||
source, use_memory_map=memory_map, use_threads=use_threads)
|
||||
|
||||
if columns is None:
|
||||
return reader.read()
|
||||
|
||||
column_types = [type(column) for column in columns]
|
||||
if all(map(lambda t: t == int, column_types)):
|
||||
table = reader.read_indices(columns)
|
||||
elif all(map(lambda t: t == str, column_types)):
|
||||
table = reader.read_names(columns)
|
||||
else:
|
||||
column_type_names = [t.__name__ for t in column_types]
|
||||
raise TypeError("Columns must be indices or names. "
|
||||
"Got columns {} of types {}"
|
||||
.format(columns, column_type_names))
|
||||
|
||||
# Feather v1 already respects the column selection
|
||||
if reader.version < 3:
|
||||
return table
|
||||
# Feather v2 reads with sorted / deduplicated selection
|
||||
elif sorted(set(columns)) == columns:
|
||||
return table
|
||||
else:
|
||||
# follow exact order / selection of names
|
||||
return table.select(columns)
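# Illustrative usage sketch (not part of the original module; the path
# is just an example): name-based selection preserves the requested
# order, which is why the code above re-selects when the names are not
# already sorted and deduplicated.
from pyarrow import feather

t = feather.read_table("/tmp/example.feather", columns=["a"])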
|
||||
511
venv/lib/python3.9/site-packages/pyarrow/filesystem.py
Normal file
@@ -0,0 +1,511 @@
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
|
||||
import os
|
||||
import posixpath
|
||||
import sys
|
||||
import urllib.parse
|
||||
import warnings
|
||||
|
||||
from os.path import join as pjoin
|
||||
|
||||
import pyarrow as pa
|
||||
from pyarrow.util import implements, _stringify_path, _is_path_like, _DEPR_MSG
|
||||
|
||||
|
||||
_FS_DEPR_MSG = _DEPR_MSG.format(
|
||||
"filesystem.LocalFileSystem", "2.0.0", "fs.LocalFileSystem"
|
||||
)
|
||||
|
||||
|
||||
class FileSystem:
|
||||
"""
|
||||
Abstract filesystem interface.
|
||||
"""
|
||||
|
||||
def cat(self, path):
|
||||
"""
|
||||
Return contents of file as a bytes object.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
File path to read content from.
|
||||
|
||||
Returns
|
||||
-------
|
||||
contents : bytes
|
||||
"""
|
||||
with self.open(path, 'rb') as f:
|
||||
return f.read()
|
||||
|
||||
def ls(self, path):
|
||||
"""
|
||||
Return list of file paths.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Directory to list contents from.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def delete(self, path, recursive=False):
|
||||
"""
|
||||
Delete the indicated file or directory.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Path to delete.
|
||||
recursive : bool, default False
|
||||
If True, also delete child paths for directories.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def disk_usage(self, path):
|
||||
"""
|
||||
Compute bytes used by all contents under indicated path in file tree.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Can be a file path or directory.
|
||||
|
||||
Returns
|
||||
-------
|
||||
usage : int
|
||||
"""
|
||||
path = _stringify_path(path)
|
||||
path_info = self.stat(path)
|
||||
if path_info['kind'] == 'file':
|
||||
return path_info['size']
|
||||
|
||||
total = 0
|
||||
for root, directories, files in self.walk(path):
|
||||
for child_path in files:
|
||||
abspath = self._path_join(root, child_path)
|
||||
total += self.stat(abspath)['size']
|
||||
|
||||
return total
|
||||
|
||||
def _path_join(self, *args):
|
||||
return self.pathsep.join(args)
|
||||
|
||||
def stat(self, path):
|
||||
"""
|
||||
Information about a filesystem entry.
|
||||
|
||||
Returns
|
||||
-------
|
||||
stat : dict
|
||||
"""
|
||||
raise NotImplementedError('FileSystem.stat')
|
||||
|
||||
def rm(self, path, recursive=False):
|
||||
"""
|
||||
Alias for FileSystem.delete.
|
||||
"""
|
||||
return self.delete(path, recursive=recursive)
|
||||
|
||||
def mv(self, path, new_path):
|
||||
"""
|
||||
Alias for FileSystem.rename.
|
||||
"""
|
||||
return self.rename(path, new_path)
|
||||
|
||||
def rename(self, path, new_path):
|
||||
"""
|
||||
Rename file, like UNIX mv command.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Path to alter.
|
||||
new_path : str
|
||||
Path to move to.
|
||||
"""
|
||||
raise NotImplementedError('FileSystem.rename')
|
||||
|
||||
def mkdir(self, path, create_parents=True):
|
||||
"""
|
||||
Create a directory.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Path to the directory.
|
||||
create_parents : bool, default True
|
||||
If the parent directories don't exists create them as well.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def exists(self, path):
|
||||
"""
|
||||
Return True if path exists.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Path to check.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def isdir(self, path):
|
||||
"""
|
||||
Return True if path is a directory.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Path to check.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def isfile(self, path):
|
||||
"""
|
||||
Return True if path is a file.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Path to check.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _isfilestore(self):
|
||||
"""
|
||||
Returns True if this FileSystem is a unix-style file store with
|
||||
directories.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def read_parquet(self, path, columns=None, metadata=None, schema=None,
|
||||
use_threads=True, use_pandas_metadata=False):
|
||||
"""
|
||||
Read Parquet data from path in file system. Can read from a single file
|
||||
or a directory of files.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
path : str
|
||||
Single file path or directory
|
||||
columns : List[str], optional
|
||||
Subset of columns to read.
|
||||
metadata : pyarrow.parquet.FileMetaData
|
||||
Known metadata to validate files against.
|
||||
schema : pyarrow.parquet.Schema
|
||||
Known schema to validate files against. Alternative to metadata
|
||||
argument.
|
||||
use_threads : bool, default True
|
||||
Perform multi-threaded column reads.
|
||||
use_pandas_metadata : bool, default False
|
||||
If True and file has custom pandas schema metadata, ensure that
|
||||
index columns are also loaded.
|
||||
|
||||
Returns
|
||||
-------
|
||||
table : pyarrow.Table
|
||||
"""
|
||||
from pyarrow.parquet import ParquetDataset
|
||||
dataset = ParquetDataset(path, schema=schema, metadata=metadata,
|
||||
filesystem=self)
|
||||
return dataset.read(columns=columns, use_threads=use_threads,
|
||||
use_pandas_metadata=use_pandas_metadata)
|
||||
|
||||
def open(self, path, mode='rb'):
|
||||
"""
|
||||
Open file for reading or writing.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def pathsep(self):
|
||||
return '/'
|
||||
|
||||
|
||||
class LocalFileSystem(FileSystem):
|
||||
|
||||
_instance = None
|
||||
|
||||
def __init__(self):
|
||||
warnings.warn(_FS_DEPR_MSG, FutureWarning, stacklevel=2)
|
||||
super().__init__()
|
||||
|
||||
@classmethod
|
||||
def _get_instance(cls):
|
||||
if cls._instance is None:
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
cls._instance = LocalFileSystem()
|
||||
return cls._instance
|
||||
|
||||
@classmethod
|
||||
def get_instance(cls):
|
||||
warnings.warn(_FS_DEPR_MSG, FutureWarning, stacklevel=2)
|
||||
return cls._get_instance()
|
||||
|
||||
@implements(FileSystem.ls)
|
||||
def ls(self, path):
|
||||
path = _stringify_path(path)
|
||||
return sorted(pjoin(path, x) for x in os.listdir(path))
|
||||
|
||||
@implements(FileSystem.mkdir)
|
||||
def mkdir(self, path, create_parents=True):
|
||||
path = _stringify_path(path)
|
||||
if create_parents:
|
||||
os.makedirs(path)
|
||||
else:
|
||||
os.mkdir(path)
|
||||
|
||||
@implements(FileSystem.isdir)
|
||||
def isdir(self, path):
|
||||
path = _stringify_path(path)
|
||||
return os.path.isdir(path)
|
||||
|
||||
@implements(FileSystem.isfile)
|
||||
def isfile(self, path):
|
||||
path = _stringify_path(path)
|
||||
return os.path.isfile(path)
|
||||
|
||||
@implements(FileSystem._isfilestore)
|
||||
def _isfilestore(self):
|
||||
return True
|
||||
|
||||
@implements(FileSystem.exists)
|
||||
def exists(self, path):
|
||||
path = _stringify_path(path)
|
||||
return os.path.exists(path)
|
||||
|
||||
@implements(FileSystem.open)
|
||||
def open(self, path, mode='rb'):
|
||||
"""
|
||||
Open file for reading or writing.
|
||||
"""
|
||||
path = _stringify_path(path)
|
||||
return open(path, mode=mode)
|
||||
|
||||
@property
|
||||
def pathsep(self):
|
||||
return os.path.sep
|
||||
|
||||
def walk(self, path):
|
||||
"""
|
||||
Directory tree generator, see os.walk.
|
||||
"""
|
||||
path = _stringify_path(path)
|
||||
return os.walk(path)
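# Illustrative aside (not part of the original module): given the
# FutureWarning in __init__, new code should use the pyarrow.fs
# replacement instead. A minimal sketch:
from pyarrow import fs

local = fs.LocalFileSystem()
infos = local.get_file_info(fs.FileSelector("/tmp", recursive=False))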
|
||||
|
||||
|
||||
class DaskFileSystem(FileSystem):
|
||||
"""
|
||||
Wraps a Dask-style filesystem implementation such as s3fs or gcsfs.
|
||||
"""
|
||||
|
||||
def __init__(self, fs):
|
||||
warnings.warn(
|
||||
"The pyarrow.filesystem.DaskFileSystem/S3FSWrapper are deprecated "
|
||||
"as of pyarrow 3.0.0, and will be removed in a future version.",
|
||||
FutureWarning, stacklevel=2)
|
||||
self.fs = fs
|
||||
|
||||
@implements(FileSystem.isdir)
|
||||
def isdir(self, path):
|
||||
raise NotImplementedError("Unsupported file system API")
|
||||
|
||||
@implements(FileSystem.isfile)
|
||||
def isfile(self, path):
|
||||
raise NotImplementedError("Unsupported file system API")
|
||||
|
||||
@implements(FileSystem._isfilestore)
|
||||
def _isfilestore(self):
|
||||
"""
|
||||
Object Stores like S3 and GCSFS are based on key lookups, not true
|
||||
file-paths.
|
||||
"""
|
||||
return False
|
||||
|
||||
@implements(FileSystem.delete)
|
||||
def delete(self, path, recursive=False):
|
||||
path = _stringify_path(path)
|
||||
return self.fs.rm(path, recursive=recursive)
|
||||
|
||||
@implements(FileSystem.exists)
|
||||
def exists(self, path):
|
||||
path = _stringify_path(path)
|
||||
return self.fs.exists(path)
|
||||
|
||||
@implements(FileSystem.mkdir)
|
||||
def mkdir(self, path, create_parents=True):
|
||||
path = _stringify_path(path)
|
||||
if create_parents:
|
||||
return self.fs.mkdirs(path)
|
||||
else:
|
||||
return self.fs.mkdir(path)
|
||||
|
||||
@implements(FileSystem.open)
|
||||
def open(self, path, mode='rb'):
|
||||
"""
|
||||
Open file for reading or writing.
|
||||
"""
|
||||
path = _stringify_path(path)
|
||||
return self.fs.open(path, mode=mode)
|
||||
|
||||
def ls(self, path, detail=False):
|
||||
path = _stringify_path(path)
|
||||
return self.fs.ls(path, detail=detail)
|
||||
|
||||
def walk(self, path):
|
||||
"""
|
||||
Directory tree generator, like os.walk.
|
||||
"""
|
||||
path = _stringify_path(path)
|
||||
return self.fs.walk(path)


class S3FSWrapper(DaskFileSystem):

    @implements(FileSystem.isdir)
    def isdir(self, path):
        path = _sanitize_s3(_stringify_path(path))
        try:
            contents = self.fs.ls(path)
            if len(contents) == 1 and contents[0] == path:
                return False
            else:
                return True
        except OSError:
            return False

    @implements(FileSystem.isfile)
    def isfile(self, path):
        path = _sanitize_s3(_stringify_path(path))
        try:
            contents = self.fs.ls(path)
            return len(contents) == 1 and contents[0] == path
        except OSError:
            return False

    def walk(self, path, refresh=False):
        """
        Directory tree generator, like os.walk.

        Generator version of what is in s3fs, which yields a flattened list
        of files.
        """
        path = _sanitize_s3(_stringify_path(path))
        directories = set()
        files = set()

        for key in list(self.fs._ls(path, refresh=refresh)):
            path = key['Key']
            if key['StorageClass'] == 'DIRECTORY':
                directories.add(path)
            elif key['StorageClass'] == 'BUCKET':
                pass
            else:
                files.add(path)

        # s3fs creates duplicate 'DIRECTORY' entries
        files = sorted([posixpath.split(f)[1] for f in files
                        if f not in directories])
        directories = sorted([posixpath.split(x)[1]
                              for x in directories])

        yield path, directories, files

        for directory in directories:
            yield from self.walk(directory, refresh=refresh)


def _sanitize_s3(path):
    if path.startswith('s3://'):
        return path.replace('s3://', '')
    else:
        return path
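
A quick illustration (not from the upstream file): _sanitize_s3 only strips the scheme prefix; bucket and key are left untouched:

    assert _sanitize_s3("s3://bucket/key.parquet") == "bucket/key.parquet"
    assert _sanitize_s3("bucket/key.parquet") == "bucket/key.parquet"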


def _ensure_filesystem(fs):
    fs_type = type(fs)

    # If the arrow filesystem was subclassed, assume it supports the full
    # interface and return it
    if not issubclass(fs_type, FileSystem):
        if "fsspec" in sys.modules:
            fsspec = sys.modules["fsspec"]
            if isinstance(fs, fsspec.AbstractFileSystem):
                # for recent fsspec versions that stop inheriting from
                # pyarrow.filesystem.FileSystem, still allow fsspec
                # filesystems (which should be compatible with our legacy fs)
                return fs

        raise OSError('Unrecognized filesystem: {}'.format(fs_type))
    else:
        return fs


def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return a (filesystem, path) tuple from a path which could be an HDFS
    URI, a local URI, or a plain filesystem path.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    if filesystem is not None:
        filesystem = _ensure_filesystem(filesystem)
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(where)
        elif not isinstance(where, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        else:
            path = where
        return filesystem, path

    path = _stringify_path(where)

    parsed_uri = urllib.parse.urlparse(path)
    if parsed_uri.scheme == 'hdfs' or parsed_uri.scheme == 'viewfs':
        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        else:
            host = parsed_uri.scheme + "://" + host
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs._connect(host=host, port=port)
        fs_path = parsed_uri.path
    elif parsed_uri.scheme == 'file':
        # Input is local URI such as file:///home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = parsed_uri.path
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = path

    return fs, fs_path
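
A sketch (not from the upstream file) of how the resolver above dispatches: a plain path or file:// URI resolves to the LocalFileSystem singleton, while an hdfs:// or viewfs:// URI would call pa.hdfs._connect, which needs a reachable cluster and is not exercised here:

    from pyarrow.filesystem import LocalFileSystem, resolve_filesystem_and_path

    fs, path = resolve_filesystem_and_path("/home/user/myfile.parquet")
    assert isinstance(fs, LocalFileSystem)

    fs, path = resolve_filesystem_and_path("file:///home/user/myfile.parquet")
    assert path == "/home/user/myfile.parquet"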
64
venv/lib/python3.9/site-packages/pyarrow/flight.py
Normal file
@@ -0,0 +1,64 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from pyarrow._flight import ( # noqa:F401
    connect,
    Action,
    ActionType,
    BasicAuth,
    CallInfo,
    CertKeyPair,
    ClientAuthHandler,
    ClientMiddleware,
    ClientMiddlewareFactory,
    DescriptorType,
    FlightCallOptions,
    FlightCancelledError,
    FlightClient,
    FlightDataStream,
    FlightDescriptor,
    FlightEndpoint,
    FlightError,
    FlightInfo,
    FlightInternalError,
    FlightMetadataReader,
    FlightMetadataWriter,
    FlightMethod,
    FlightServerBase,
    FlightServerError,
    FlightStreamChunk,
    FlightStreamReader,
    FlightStreamWriter,
    FlightTimedOutError,
    FlightUnauthenticatedError,
    FlightUnauthorizedError,
    FlightUnavailableError,
    FlightWriteSizeExceededError,
    GeneratorStream,
    Location,
    MetadataRecordBatchReader,
    MetadataRecordBatchWriter,
    RecordBatchStream,
    Result,
    SchemaResult,
    ServerAuthHandler,
    ServerCallContext,
    ServerMiddleware,
    ServerMiddlewareFactory,
    Ticket,
    TracingServerMiddlewareFactory,
)
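
An illustrative sketch (not from the upstream file) of a minimal Flight service built from the re-exports above; the server address and ticket contents are arbitrary placeholders:

    import pyarrow as pa
    import pyarrow.flight as flight

    class TinyServer(flight.FlightServerBase):
        def __init__(self, location="grpc://0.0.0.0:8815"):
            super().__init__(location)
            self._table = pa.table({"x": [1, 2, 3]})

        def do_get(self, context, ticket):
            # RecordBatchStream adapts a Table for the wire
            return flight.RecordBatchStream(self._table)

    # client side:
    #   client = flight.connect("grpc://localhost:8815")
    #   table = client.do_get(flight.Ticket(b"anything")).read_all()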
434
venv/lib/python3.9/site-packages/pyarrow/fs.py
Normal file
@@ -0,0 +1,434 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

"""
FileSystem abstraction to interact with various local and remote filesystems.
"""

from pyarrow.util import _is_path_like, _stringify_path

from pyarrow._fs import ( # noqa
    FileSelector,
    FileType,
    FileInfo,
    FileSystem,
    LocalFileSystem,
    SubTreeFileSystem,
    _MockFileSystem,
    FileSystemHandler,
    PyFileSystem,
    _copy_files,
    _copy_files_selector,
)

# For backward compatibility.
FileStats = FileInfo

_not_imported = []

try:
    from pyarrow._hdfs import HadoopFileSystem  # noqa
except ImportError:
    _not_imported.append("HadoopFileSystem")

try:
    from pyarrow._gcsfs import GcsFileSystem  # noqa
except ImportError:
    _not_imported.append("GcsFileSystem")

try:
    from pyarrow._s3fs import (  # noqa
        AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy,
        S3FileSystem, S3LogLevel, S3RetryStrategy, finalize_s3,
        initialize_s3, resolve_s3_region)
except ImportError:
    _not_imported.append("S3FileSystem")
else:
    initialize_s3()


def __getattr__(name):
    if name in _not_imported:
        raise ImportError(
            "The pyarrow installation is not built with support for "
            "'{0}'".format(name)
        )

    raise AttributeError(
        "module 'pyarrow.fs' has no attribute '{0}'".format(name)
    )


def _filesystem_from_str(uri):
    # instantiate the file system from a URI; if the URI has a path
    # component then it will be treated as a path prefix
    filesystem, prefix = FileSystem.from_uri(uri)
    prefix = filesystem.normalize_path(prefix)
    if prefix:
        # validate that the prefix is pointing to a directory
        prefix_info = filesystem.get_file_info([prefix])[0]
        if prefix_info.type != FileType.Directory:
            raise ValueError(
                "The path component of the filesystem URI must point to a "
                "directory but it has a type: `{}`. The path component "
                "is `{}` and the given filesystem URI is `{}`".format(
                    prefix_info.type.name, prefix_info.path, uri
                )
            )
        filesystem = SubTreeFileSystem(prefix, filesystem)
    return filesystem


def _ensure_filesystem(
    filesystem, use_mmap=False, allow_legacy_filesystem=False
):
    if isinstance(filesystem, FileSystem):
        return filesystem
    elif isinstance(filesystem, str):
        if use_mmap:
            raise ValueError(
                "Specifying memory mapping is not supported for a "
                "filesystem specified as a URI string"
            )
        return _filesystem_from_str(filesystem)

    # handle fsspec-compatible filesystems
    try:
        import fsspec
    except ImportError:
        pass
    else:
        if isinstance(filesystem, fsspec.AbstractFileSystem):
            if type(filesystem).__name__ == 'LocalFileSystem':
                # In case it's a plain local filesystem, use the native
                # arrow one
                return LocalFileSystem(use_mmap=use_mmap)
            return PyFileSystem(FSSpecHandler(filesystem))

    # map old filesystems to new ones
    import pyarrow.filesystem as legacyfs

    if isinstance(filesystem, legacyfs.LocalFileSystem):
        return LocalFileSystem(use_mmap=use_mmap)
    # TODO handle HDFS?
    if allow_legacy_filesystem and isinstance(filesystem, legacyfs.FileSystem):
        return filesystem

    raise TypeError(
        "Unrecognized filesystem: {}. `filesystem` argument must be a "
        "FileSystem instance or a valid file system URI".format(
            type(filesystem))
    )
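
A sketch (not from the upstream file) of the dispatch above: FileSystem instances pass through unchanged, URI strings are parsed, and fsspec filesystems get wrapped:

    from pyarrow.fs import LocalFileSystem

    fs = LocalFileSystem()
    assert _ensure_filesystem(fs) is fs        # already a FileSystem
    fs2 = _ensure_filesystem("file:///tmp")    # URI string -> _filesystem_from_str
    # import fsspec
    # _ensure_filesystem(fsspec.filesystem("memory"))
    #   -> PyFileSystem(FSSpecHandler(...)) wrapper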


def _resolve_filesystem_and_path(
    path, filesystem=None, allow_legacy_filesystem=False, memory_map=False
):
    """
    Return filesystem/path from path which could be a URI or a plain
    filesystem path.
    """
    if not _is_path_like(path):
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'."
            )
        return filesystem, path

    if filesystem is not None:
        filesystem = _ensure_filesystem(
            filesystem, use_mmap=memory_map,
            allow_legacy_filesystem=allow_legacy_filesystem
        )
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(path)
        elif not isinstance(path, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        if not allow_legacy_filesystem:
            path = filesystem.normalize_path(path)
        return filesystem, path

    path = _stringify_path(path)

    # if filesystem is not given, try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as a URI
    filesystem = LocalFileSystem(use_mmap=memory_map)

    try:
        file_info = filesystem.get_file_info(path)
    except ValueError:  # ValueError means path is likely a URI
        file_info = None
        exists_locally = False
    else:
        exists_locally = (file_info.type != FileType.NotFound)

    # if the file or directory doesn't exist locally, then assume that
    # the path is a URI describing the file system as well
    if not exists_locally:
        try:
            filesystem, path = FileSystem.from_uri(path)
        except ValueError as e:
            # neither a URI nor a locally existing path, so assume that
            # a local path was given and propagate a nicer file-not-found
            # error instead of a more confusing scheme parsing error
            if "empty scheme" not in str(e) \
                    and "Cannot parse URI" not in str(e):
                raise
    else:
        path = filesystem.normalize_path(path)

    return filesystem, path


def copy_files(source, destination,
               source_filesystem=None, destination_filesystem=None,
               *, chunk_size=1024*1024, use_threads=True):
    """
    Copy files between FileSystems.

    This function allows you to recursively copy directories of files from
    one file system to another, such as from S3 to your local machine.

    Parameters
    ----------
    source : string
        Source file path or URI to a single file or directory.
        If a directory, files will be copied recursively from this path.
    destination : string
        Destination file path or URI. If `source` is a file, `destination`
        is also interpreted as the destination file (not directory).
        Directories will be created as necessary.
    source_filesystem : FileSystem, optional
        Source filesystem, needs to be specified if `source` is not a URI,
        otherwise inferred.
    destination_filesystem : FileSystem, optional
        Destination filesystem, needs to be specified if `destination` is not
        a URI, otherwise inferred.
    chunk_size : int, default 1MB
        The maximum size of block to read before flushing to the
        destination file. A larger chunk_size will use more memory while
        copying but may help accommodate high latency FileSystems.
    use_threads : bool, default True
        Whether to use multiple threads to accelerate copying.

    Examples
    --------
    Inspect an S3 bucket's files:

    >>> s3, path = fs.FileSystem.from_uri(
    ...     "s3://registry.opendata.aws/roda/ndjson/")
    >>> selector = fs.FileSelector(path)
    >>> s3.get_file_info(selector)
    [<FileInfo for 'registry.opendata.aws/roda/ndjson/index.ndjson':...]

    Copy one file from an S3 bucket to a local directory:

    >>> fs.copy_files("s3://registry.opendata.aws/roda/ndjson/index.ndjson",
    ...               "file:///{}/index_copy.ndjson".format(local_path))

    >>> fs.LocalFileSystem().get_file_info(str(local_path)+
    ...                                    '/index_copy.ndjson')
    <FileInfo for '.../index_copy.ndjson': type=FileType.File, size=...>

    Copy a file using an explicit FileSystem object:

    >>> fs.copy_files("registry.opendata.aws/roda/ndjson/index.ndjson",
    ...               "file:///{}/index_copy.ndjson".format(local_path),
    ...               source_filesystem=fs.S3FileSystem())
    """
    source_fs, source_path = _resolve_filesystem_and_path(
        source, source_filesystem
    )
    destination_fs, destination_path = _resolve_filesystem_and_path(
        destination, destination_filesystem
    )

    file_info = source_fs.get_file_info(source_path)
    if file_info.type == FileType.Directory:
        source_sel = FileSelector(source_path, recursive=True)
        _copy_files_selector(source_fs, source_sel,
                             destination_fs, destination_path,
                             chunk_size, use_threads)
    else:
        _copy_files(source_fs, source_path,
                    destination_fs, destination_path,
                    chunk_size, use_threads)


class FSSpecHandler(FileSystemHandler):
    """
    Handler for fsspec-based Python filesystems.

    https://filesystem-spec.readthedocs.io/en/latest/index.html

    Parameters
    ----------
    fs : FSSpec-compliant filesystem instance

    Examples
    --------
    >>> PyFileSystem(FSSpecHandler(fsspec_fs)) # doctest: +SKIP
    """

    def __init__(self, fs):
        self.fs = fs

    def __eq__(self, other):
        if isinstance(other, FSSpecHandler):
            return self.fs == other.fs
        return NotImplemented

    def __ne__(self, other):
        if isinstance(other, FSSpecHandler):
            return self.fs != other.fs
        return NotImplemented

    def get_type_name(self):
        protocol = self.fs.protocol
        if isinstance(protocol, list):
            protocol = protocol[0]
        return "fsspec+{0}".format(protocol)

    def normalize_path(self, path):
        return path

    @staticmethod
    def _create_file_info(path, info):
        size = info["size"]
        if info["type"] == "file":
            ftype = FileType.File
        elif info["type"] == "directory":
            ftype = FileType.Directory
            # some fsspec filesystems include a file size for directories
            size = None
        else:
            ftype = FileType.Unknown
        return FileInfo(path, ftype, size=size, mtime=info.get("mtime", None))

    def get_file_info(self, paths):
        infos = []
        for path in paths:
            try:
                info = self.fs.info(path)
            except FileNotFoundError:
                infos.append(FileInfo(path, FileType.NotFound))
            else:
                infos.append(self._create_file_info(path, info))
        return infos

    def get_file_info_selector(self, selector):
        if not self.fs.isdir(selector.base_dir):
            if self.fs.exists(selector.base_dir):
                raise NotADirectoryError(selector.base_dir)
            else:
                if selector.allow_not_found:
                    return []
                else:
                    raise FileNotFoundError(selector.base_dir)

        if selector.recursive:
            maxdepth = None
        else:
            maxdepth = 1

        infos = []
        selected_files = self.fs.find(
            selector.base_dir, maxdepth=maxdepth, withdirs=True, detail=True
        )
        for path, info in selected_files.items():
            infos.append(self._create_file_info(path, info))

        return infos

    def create_dir(self, path, recursive):
        # mkdir also raises FileNotFoundError when base directory is not found
        try:
            self.fs.mkdir(path, create_parents=recursive)
        except FileExistsError:
            pass

    def delete_dir(self, path):
        self.fs.rm(path, recursive=True)

    def _delete_dir_contents(self, path, missing_dir_ok=False):
        try:
            subpaths = self.fs.listdir(path, detail=False)
        except FileNotFoundError:
            if missing_dir_ok:
                return
            raise
        for subpath in subpaths:
            if self.fs.isdir(subpath):
                self.fs.rm(subpath, recursive=True)
            elif self.fs.isfile(subpath):
                self.fs.rm(subpath)

    def delete_dir_contents(self, path, missing_dir_ok):
        if path.strip("/") == "":
            raise ValueError(
                "delete_dir_contents called on path '{}'".format(path))
        self._delete_dir_contents(path, missing_dir_ok)

    def delete_root_dir_contents(self):
        self._delete_dir_contents("/")

    def delete_file(self, path):
        # fs.rm correctly raises IsADirectoryError when `path` is a directory
        # instead of a file and `recursive` is not set to True
        if not self.fs.exists(path):
            raise FileNotFoundError(path)
        self.fs.rm(path)

    def move(self, src, dest):
        self.fs.mv(src, dest, recursive=True)

    def copy_file(self, src, dest):
        # fs.copy correctly raises IsADirectoryError when `src` is a directory
        # instead of a file
        self.fs.copy(src, dest)

    # TODO can we read/pass metadata (e.g. Content-Type) in the methods below?

    def open_input_stream(self, path):
        from pyarrow import PythonFile

        if not self.fs.isfile(path):
            raise FileNotFoundError(path)

        return PythonFile(self.fs.open(path, mode="rb"), mode="r")

    def open_input_file(self, path):
        from pyarrow import PythonFile

        if not self.fs.isfile(path):
            raise FileNotFoundError(path)

        return PythonFile(self.fs.open(path, mode="rb"), mode="r")

    def open_output_stream(self, path, metadata):
        from pyarrow import PythonFile

        return PythonFile(self.fs.open(path, mode="wb"), mode="w")

    def open_append_stream(self, path, metadata):
        from pyarrow import PythonFile

        return PythonFile(self.fs.open(path, mode="ab"), mode="w")
579
venv/lib/python3.9/site-packages/pyarrow/gandiva.pyx
Normal file
@@ -0,0 +1,579 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

# cython: profile=False
# distutils: language = c++
# cython: language_level = 3

from libcpp cimport bool as c_bool, nullptr
from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
from libcpp.string cimport string as c_string
from libcpp.vector cimport vector as c_vector
from libcpp.unordered_set cimport unordered_set as c_unordered_set
from libc.stdint cimport int64_t, int32_t, uint8_t, uintptr_t

from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (Array, DataType, Field, MemoryPool, RecordBatch,
                          Schema, check_status, pyarrow_wrap_array,
                          pyarrow_wrap_data_type, ensure_type, _Weakrefable,
                          pyarrow_wrap_field)
from pyarrow.lib import frombytes

from pyarrow.includes.libgandiva cimport (
    CCondition, CGandivaExpression,
    CNode, CProjector, CFilter,
    CSelectionVector,
    CSelectionVector_Mode,
    _ensure_selection_mode,
    CConfiguration,
    CConfigurationBuilder,
    TreeExprBuilder_MakeExpression,
    TreeExprBuilder_MakeFunction,
    TreeExprBuilder_MakeBoolLiteral,
    TreeExprBuilder_MakeUInt8Literal,
    TreeExprBuilder_MakeUInt16Literal,
    TreeExprBuilder_MakeUInt32Literal,
    TreeExprBuilder_MakeUInt64Literal,
    TreeExprBuilder_MakeInt8Literal,
    TreeExprBuilder_MakeInt16Literal,
    TreeExprBuilder_MakeInt32Literal,
    TreeExprBuilder_MakeInt64Literal,
    TreeExprBuilder_MakeFloatLiteral,
    TreeExprBuilder_MakeDoubleLiteral,
    TreeExprBuilder_MakeStringLiteral,
    TreeExprBuilder_MakeBinaryLiteral,
    TreeExprBuilder_MakeField,
    TreeExprBuilder_MakeIf,
    TreeExprBuilder_MakeAnd,
    TreeExprBuilder_MakeOr,
    TreeExprBuilder_MakeCondition,
    TreeExprBuilder_MakeInExpressionInt32,
    TreeExprBuilder_MakeInExpressionInt64,
    TreeExprBuilder_MakeInExpressionTime32,
    TreeExprBuilder_MakeInExpressionTime64,
    TreeExprBuilder_MakeInExpressionDate32,
    TreeExprBuilder_MakeInExpressionDate64,
    TreeExprBuilder_MakeInExpressionTimeStamp,
    TreeExprBuilder_MakeInExpressionString,
    TreeExprBuilder_MakeInExpressionBinary,
    SelectionVector_MakeInt16,
    SelectionVector_MakeInt32,
    SelectionVector_MakeInt64,
    Projector_Make,
    Filter_Make,
    CFunctionSignature,
    GetRegisteredFunctionSignatures)


cdef class Node(_Weakrefable):
    cdef:
        shared_ptr[CNode] node

    def __init__(self):
        raise TypeError("Do not call {}'s constructor directly, use the "
                        "TreeExprBuilder API instead"
                        .format(self.__class__.__name__))

    @staticmethod
    cdef create(shared_ptr[CNode] node):
        cdef Node self = Node.__new__(Node)
        self.node = node
        return self

    def __str__(self):
        return self.node.get().ToString().decode()

    def __repr__(self):
        type_format = object.__repr__(self)
        return '{0}\n{1}'.format(type_format, str(self))

    def return_type(self):
        return pyarrow_wrap_data_type(self.node.get().return_type())


cdef class Expression(_Weakrefable):
    cdef:
        shared_ptr[CGandivaExpression] expression

    cdef void init(self, shared_ptr[CGandivaExpression] expression):
        self.expression = expression

    def __str__(self):
        return self.expression.get().ToString().decode()

    def __repr__(self):
        type_format = object.__repr__(self)
        return '{0}\n{1}'.format(type_format, str(self))

    def root(self):
        return Node.create(self.expression.get().root())

    def result(self):
        return pyarrow_wrap_field(self.expression.get().result())


cdef class Condition(_Weakrefable):
    cdef:
        shared_ptr[CCondition] condition

    def __init__(self):
        raise TypeError("Do not call {}'s constructor directly, use the "
                        "TreeExprBuilder API instead"
                        .format(self.__class__.__name__))

    @staticmethod
    cdef create(shared_ptr[CCondition] condition):
        cdef Condition self = Condition.__new__(Condition)
        self.condition = condition
        return self

    def __str__(self):
        return self.condition.get().ToString().decode()

    def __repr__(self):
        type_format = object.__repr__(self)
        return '{0}\n{1}'.format(type_format, str(self))

    def root(self):
        return Node.create(self.condition.get().root())

    def result(self):
        return pyarrow_wrap_field(self.condition.get().result())


cdef class SelectionVector(_Weakrefable):
    cdef:
        shared_ptr[CSelectionVector] selection_vector

    def __init__(self):
        raise TypeError("Do not call {}'s constructor directly."
                        .format(self.__class__.__name__))

    @staticmethod
    cdef create(shared_ptr[CSelectionVector] selection_vector):
        cdef SelectionVector self = SelectionVector.__new__(SelectionVector)
        self.selection_vector = selection_vector
        return self

    def to_array(self):
        cdef shared_ptr[CArray] result = self.selection_vector.get().ToArray()
        return pyarrow_wrap_array(result)


cdef class Projector(_Weakrefable):
    cdef:
        shared_ptr[CProjector] projector
        MemoryPool pool

    def __init__(self):
        raise TypeError("Do not call {}'s constructor directly, use "
                        "make_projector instead"
                        .format(self.__class__.__name__))

    @staticmethod
    cdef create(shared_ptr[CProjector] projector, MemoryPool pool):
        cdef Projector self = Projector.__new__(Projector)
        self.projector = projector
        self.pool = pool
        return self

    @property
    def llvm_ir(self):
        return self.projector.get().DumpIR().decode()

    def evaluate(self, RecordBatch batch, SelectionVector selection=None):
        cdef vector[shared_ptr[CArray]] results
        if selection is None:
            check_status(self.projector.get().Evaluate(
                batch.sp_batch.get()[0], self.pool.pool, &results))
        else:
            check_status(
                self.projector.get().Evaluate(
                    batch.sp_batch.get()[0], selection.selection_vector.get(),
                    self.pool.pool, &results))
        cdef shared_ptr[CArray] result
        arrays = []
        for result in results:
            arrays.append(pyarrow_wrap_array(result))
        return arrays


cdef class Filter(_Weakrefable):
    cdef:
        shared_ptr[CFilter] filter

    def __init__(self):
        raise TypeError("Do not call {}'s constructor directly, use "
                        "make_filter instead"
                        .format(self.__class__.__name__))

    @staticmethod
    cdef create(shared_ptr[CFilter] filter):
        cdef Filter self = Filter.__new__(Filter)
        self.filter = filter
        return self

    @property
    def llvm_ir(self):
        return self.filter.get().DumpIR().decode()

    def evaluate(self, RecordBatch batch, MemoryPool pool, dtype='int32'):
        cdef:
            DataType type = ensure_type(dtype)
            shared_ptr[CSelectionVector] selection

        if type.id == _Type_INT16:
            check_status(SelectionVector_MakeInt16(
                batch.num_rows, pool.pool, &selection))
        elif type.id == _Type_INT32:
            check_status(SelectionVector_MakeInt32(
                batch.num_rows, pool.pool, &selection))
        elif type.id == _Type_INT64:
            check_status(SelectionVector_MakeInt64(
                batch.num_rows, pool.pool, &selection))
        else:
            raise ValueError("'dtype' of the selection vector should be "
                             "one of 'int16', 'int32' and 'int64'.")

        check_status(self.filter.get().Evaluate(
            batch.sp_batch.get()[0], selection))
        return SelectionVector.create(selection)


cdef class TreeExprBuilder(_Weakrefable):

    def make_literal(self, value, dtype):
        cdef:
            DataType type = ensure_type(dtype)
            shared_ptr[CNode] r

        if type.id == _Type_BOOL:
            r = TreeExprBuilder_MakeBoolLiteral(value)
        elif type.id == _Type_UINT8:
            r = TreeExprBuilder_MakeUInt8Literal(value)
        elif type.id == _Type_UINT16:
            r = TreeExprBuilder_MakeUInt16Literal(value)
        elif type.id == _Type_UINT32:
            r = TreeExprBuilder_MakeUInt32Literal(value)
        elif type.id == _Type_UINT64:
            r = TreeExprBuilder_MakeUInt64Literal(value)
        elif type.id == _Type_INT8:
            r = TreeExprBuilder_MakeInt8Literal(value)
        elif type.id == _Type_INT16:
            r = TreeExprBuilder_MakeInt16Literal(value)
        elif type.id == _Type_INT32:
            r = TreeExprBuilder_MakeInt32Literal(value)
        elif type.id == _Type_INT64:
            r = TreeExprBuilder_MakeInt64Literal(value)
        elif type.id == _Type_FLOAT:
            r = TreeExprBuilder_MakeFloatLiteral(value)
        elif type.id == _Type_DOUBLE:
            r = TreeExprBuilder_MakeDoubleLiteral(value)
        elif type.id == _Type_STRING:
            r = TreeExprBuilder_MakeStringLiteral(value.encode('UTF-8'))
        elif type.id == _Type_BINARY:
            r = TreeExprBuilder_MakeBinaryLiteral(value)
        else:
            raise TypeError("Didn't recognize dtype " + str(dtype))

        return Node.create(r)

    def make_expression(self, Node root_node not None,
                        Field return_field not None):
        cdef shared_ptr[CGandivaExpression] r = TreeExprBuilder_MakeExpression(
            root_node.node, return_field.sp_field)
        cdef Expression expression = Expression()
        expression.init(r)
        return expression

    def make_function(self, name, children, DataType return_type):
        cdef c_vector[shared_ptr[CNode]] c_children
        cdef Node child
        for child in children:
            if child is None:
                raise TypeError("Child nodes must not be None")
            c_children.push_back(child.node)
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeFunction(
            name.encode(), c_children, return_type.sp_type)
        return Node.create(r)

    def make_field(self, Field field not None):
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeField(field.sp_field)
        return Node.create(r)

    def make_if(self, Node condition not None, Node this_node not None,
                Node else_node not None, DataType return_type not None):
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeIf(
            condition.node, this_node.node, else_node.node,
            return_type.sp_type)
        return Node.create(r)

    def make_and(self, children):
        cdef c_vector[shared_ptr[CNode]] c_children
        cdef Node child
        for child in children:
            if child is None:
                raise TypeError("Child nodes must not be None")
            c_children.push_back(child.node)
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeAnd(c_children)
        return Node.create(r)

    def make_or(self, children):
        cdef c_vector[shared_ptr[CNode]] c_children
        cdef Node child
        for child in children:
            if child is None:
                raise TypeError("Child nodes must not be None")
            c_children.push_back(child.node)
        cdef shared_ptr[CNode] r = TreeExprBuilder_MakeOr(c_children)
        return Node.create(r)

    def _make_in_expression_int32(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int32_t] c_values
        cdef int32_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionInt32(node.node, c_values)
        return Node.create(r)

    def _make_in_expression_int64(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int64_t] c_values
        cdef int64_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionInt64(node.node, c_values)
        return Node.create(r)

    def _make_in_expression_time32(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int32_t] c_values
        cdef int32_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionTime32(node.node, c_values)
        return Node.create(r)

    def _make_in_expression_time64(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int64_t] c_values
        cdef int64_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionTime64(node.node, c_values)
        return Node.create(r)

    def _make_in_expression_date32(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int32_t] c_values
        cdef int32_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionDate32(node.node, c_values)
        return Node.create(r)

    def _make_in_expression_date64(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int64_t] c_values
        cdef int64_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionDate64(node.node, c_values)
        return Node.create(r)

    def _make_in_expression_timestamp(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[int64_t] c_values
        cdef int64_t v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionTimeStamp(node.node, c_values)
        return Node.create(r)

    def _make_in_expression_binary(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[c_string] c_values
        cdef c_string v
        for v in values:
            c_values.insert(v)
        r = TreeExprBuilder_MakeInExpressionBinary(node.node, c_values)
        return Node.create(r)

    def _make_in_expression_string(self, Node node not None, values):
        cdef shared_ptr[CNode] r
        cdef c_unordered_set[c_string] c_values
        cdef c_string _v
        for v in values:
            _v = v.encode('UTF-8')
            c_values.insert(_v)
        r = TreeExprBuilder_MakeInExpressionString(node.node, c_values)
        return Node.create(r)

    def make_in_expression(self, Node node not None, values, dtype):
        cdef DataType type = ensure_type(dtype)

        if type.id == _Type_INT32:
            return self._make_in_expression_int32(node, values)
        elif type.id == _Type_INT64:
            return self._make_in_expression_int64(node, values)
        elif type.id == _Type_TIME32:
            return self._make_in_expression_time32(node, values)
        elif type.id == _Type_TIME64:
            return self._make_in_expression_time64(node, values)
        elif type.id == _Type_TIMESTAMP:
            return self._make_in_expression_timestamp(node, values)
        elif type.id == _Type_DATE32:
            return self._make_in_expression_date32(node, values)
        elif type.id == _Type_DATE64:
            return self._make_in_expression_date64(node, values)
        elif type.id == _Type_BINARY:
            return self._make_in_expression_binary(node, values)
        elif type.id == _Type_STRING:
            return self._make_in_expression_string(node, values)
        else:
            raise TypeError("Data type " + str(dtype) + " not supported.")

    def make_condition(self, Node condition not None):
        cdef shared_ptr[CCondition] r = TreeExprBuilder_MakeCondition(
            condition.node)
        return Condition.create(r)


cpdef make_projector(Schema schema, children, MemoryPool pool,
                     str selection_mode="NONE"):
    """
    Construct a projection using expressions.

    A projector is built for a specific schema and vector of expressions.
    Once the projector is built, it can be used to evaluate many row batches.

    Parameters
    ----------
    schema : pyarrow.Schema
        Schema for the record batches, and the expressions.
    children : list[pyarrow.gandiva.Expression]
        List of projectable expression objects.
    pool : pyarrow.MemoryPool
        Memory pool used to allocate output arrays.
    selection_mode : str, default "NONE"
        Possible values are NONE, UINT16, UINT32, UINT64.

    Returns
    -------
    Projector instance
    """
    cdef:
        Expression child
        c_vector[shared_ptr[CGandivaExpression]] c_children
        shared_ptr[CProjector] result

    for child in children:
        if child is None:
            raise TypeError("Expressions must not be None")
        c_children.push_back(child.expression)

    check_status(
        Projector_Make(schema.sp_schema, c_children,
                       _ensure_selection_mode(selection_mode),
                       CConfigurationBuilder.DefaultConfiguration(),
                       &result))
    return Projector.create(result, pool)


cpdef make_filter(Schema schema, Condition condition):
    """
    Construct a filter based on a condition.

    A filter is built for a specific schema and condition. Once the filter
    is built, it can be used to evaluate many row batches.

    Parameters
    ----------
    schema : pyarrow.Schema
        Schema for the record batches, and the condition.
    condition : pyarrow.gandiva.Condition
        Filter condition.

    Returns
    -------
    Filter instance
    """
    cdef shared_ptr[CFilter] result
    if condition is None:
        raise TypeError("Condition must not be None")
    check_status(
        Filter_Make(schema.sp_schema, condition.condition, &result))
    return Filter.create(result)
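
An end-to-end sketch (not from the upstream file) of the two entry points above, assuming a Gandiva-enabled pyarrow build and that "greater_than" is among the registered functions:

    import pyarrow as pa
    import pyarrow.gandiva as gandiva

    table = pa.table({"a": [1.0, 5.0, 10.0]})
    builder = gandiva.TreeExprBuilder()
    node_a = builder.make_field(table.schema.field("a"))
    threshold = builder.make_literal(4.0, pa.float64())
    condition = builder.make_condition(
        builder.make_function("greater_than", [node_a, threshold], pa.bool_()))

    fltr = gandiva.make_filter(table.schema, condition)
    selection = fltr.evaluate(table.to_batches()[0], pa.default_memory_pool())
    print(selection.to_array())   # indices of rows with a > 4.0 -> [1, 2]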


cdef class FunctionSignature(_Weakrefable):
    """
    Signature of a Gandiva function, including the name, parameter types
    and return type.
    """

    cdef:
        shared_ptr[CFunctionSignature] signature

    def __init__(self):
        raise TypeError("Do not call {}'s constructor directly."
                        .format(self.__class__.__name__))

    @staticmethod
    cdef create(shared_ptr[CFunctionSignature] signature):
        cdef FunctionSignature self = FunctionSignature.__new__(
            FunctionSignature)
        self.signature = signature
        return self

    def return_type(self):
        return pyarrow_wrap_data_type(self.signature.get().ret_type())

    def param_types(self):
        result = []
        cdef vector[shared_ptr[CDataType]] types = \
            self.signature.get().param_types()
        for t in types:
            result.append(pyarrow_wrap_data_type(t))
        return result

    def name(self):
        return self.signature.get().base_name().decode()

    def __repr__(self):
        signature = self.signature.get().ToString().decode()
        return "FunctionSignature(" + signature + ")"


def get_registered_function_signatures():
    """
    Return the functions registered in Gandiva's ExpressionRegistry.

    Returns
    -------
    registry: a list of registered function signatures
    """
    results = []

    cdef vector[shared_ptr[CFunctionSignature]] signatures = \
        GetRegisteredFunctionSignatures()

    for signature in signatures:
        results.append(FunctionSignature.create(signature))

    return results
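
A quick inspection of the registry exposed above (illustrative, not from the upstream file):

    sigs = get_registered_function_signatures()
    print(len(sigs))   # hundreds of signatures in a typical build
    print(sigs[0].name(), sigs[0].param_types(), sigs[0].return_type())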
240
venv/lib/python3.9/site-packages/pyarrow/hdfs.py
Normal file
@@ -0,0 +1,240 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


import os
import posixpath
import sys
import warnings

from pyarrow.util import implements, _DEPR_MSG
from pyarrow.filesystem import FileSystem
import pyarrow._hdfsio as _hdfsio


class HadoopFileSystem(_hdfsio.HadoopFileSystem, FileSystem):
    """
    DEPRECATED: FileSystem interface for HDFS cluster.

    See pyarrow.hdfs.connect for full connection details.

    .. deprecated:: 2.0
        ``pyarrow.hdfs.HadoopFileSystem`` is deprecated,
        please use ``pyarrow.fs.HadoopFileSystem`` instead.
    """

    def __init__(self, host="default", port=0, user=None, kerb_ticket=None,
                 driver='libhdfs', extra_conf=None):
        warnings.warn(
            _DEPR_MSG.format(
                "hdfs.HadoopFileSystem", "2.0.0", "fs.HadoopFileSystem"),
            FutureWarning, stacklevel=2)
        if driver == 'libhdfs':
            _maybe_set_hadoop_classpath()

        self._connect(host, port, user, kerb_ticket, extra_conf)

    def __reduce__(self):
        return (HadoopFileSystem, (self.host, self.port, self.user,
                                   self.kerb_ticket, self.extra_conf))

    def _isfilestore(self):
        """
        Return True if this is a Unix-style file store with directories.
        """
        return True

    @implements(FileSystem.isdir)
    def isdir(self, path):
        return super().isdir(path)

    @implements(FileSystem.isfile)
    def isfile(self, path):
        return super().isfile(path)

    @implements(FileSystem.delete)
    def delete(self, path, recursive=False):
        return super().delete(path, recursive)

    def mkdir(self, path, **kwargs):
        """
        Create directory in HDFS.

        Parameters
        ----------
        path : str
            Directory path to create, including any parent directories.

        Notes
        -----
        libhdfs does not support create_parents=False, so it is ignored here.
        """
        return super().mkdir(path)

    @implements(FileSystem.rename)
    def rename(self, path, new_path):
        return super().rename(path, new_path)

    @implements(FileSystem.exists)
    def exists(self, path):
        return super().exists(path)

    def ls(self, path, detail=False):
        """
        Retrieve directory contents and metadata, if requested.

        Parameters
        ----------
        path : str
            HDFS path to retrieve contents of.
        detail : bool, default False
            If False, only return list of paths.

        Returns
        -------
        result : list of dicts (detail=True) or strings (detail=False)
        """
        return super().ls(path, detail)

    def walk(self, top_path):
        """
        Directory tree generator for HDFS, like os.walk.

        Parameters
        ----------
        top_path : str
            Root directory for tree traversal.

        Returns
        -------
        Generator yielding 3-tuples (dirpath, dirnames, filenames)
        """
        contents = self.ls(top_path, detail=True)

        directories, files = _libhdfs_walk_files_dirs(top_path, contents)
        yield top_path, directories, files
        for dirname in directories:
            yield from self.walk(self._path_join(top_path, dirname))


def _maybe_set_hadoop_classpath():
    import re

    if re.search(r'hadoop-common[^/]+.jar', os.environ.get('CLASSPATH', '')):
        return

    if 'HADOOP_HOME' in os.environ:
        if sys.platform != 'win32':
            classpath = _derive_hadoop_classpath()
        else:
            hadoop_bin = '{}/bin/hadoop'.format(os.environ['HADOOP_HOME'])
            classpath = _hadoop_classpath_glob(hadoop_bin)
    else:
        classpath = _hadoop_classpath_glob('hadoop')

    os.environ['CLASSPATH'] = classpath.decode('utf-8')


def _derive_hadoop_classpath():
    import subprocess

    find_args = ('find', '-L', os.environ['HADOOP_HOME'], '-name', '*.jar')
    find = subprocess.Popen(find_args, stdout=subprocess.PIPE)
    xargs_echo = subprocess.Popen(('xargs', 'echo'),
                                  stdin=find.stdout,
                                  stdout=subprocess.PIPE)
    jars = subprocess.check_output(('tr', "' '", "':'"),
                                   stdin=xargs_echo.stdout)
    hadoop_conf = os.environ["HADOOP_CONF_DIR"] \
        if "HADOOP_CONF_DIR" in os.environ \
        else os.environ["HADOOP_HOME"] + "/etc/hadoop"
    return (hadoop_conf + ":").encode("utf-8") + jars


def _hadoop_classpath_glob(hadoop_bin):
    import subprocess

    hadoop_classpath_args = (hadoop_bin, 'classpath', '--glob')
    return subprocess.check_output(hadoop_classpath_args)


def _libhdfs_walk_files_dirs(top_path, contents):
    files = []
    directories = []
    for c in contents:
        scrubbed_name = posixpath.split(c['name'])[1]
        if c['kind'] == 'file':
            files.append(scrubbed_name)
        else:
            directories.append(scrubbed_name)

    return directories, files


def connect(host="default", port=0, user=None, kerb_ticket=None,
            extra_conf=None):
    """
    DEPRECATED: Connect to an HDFS cluster.

    All parameters are optional and should only be set if the defaults need
    to be overridden.

    Authentication should be automatic if the HDFS cluster uses Kerberos.
    However, if a username is specified, then the ticket cache will likely
    be required.

    .. deprecated:: 2.0
        ``pyarrow.hdfs.connect`` is deprecated,
        please use ``pyarrow.fs.HadoopFileSystem`` instead.

    Parameters
    ----------
    host : str, default "default"
        NameNode host; "default" uses fs.defaultFS from core-site.xml.
    port : int, default 0
        NameNode port; 0 selects the default or a logical (HA) NameNode.
    user : str, optional
        Username to use when connecting to HDFS; None implies the login user.
    kerb_ticket : str, optional
        Path to the Kerberos ticket cache.
    extra_conf : dict, default None
        Extra key/value pairs for the config; these override any
        hdfs-site.xml properties.

    Notes
    -----
    The first time you call this method, it will take longer than usual due
    to JNI spin-up time.

    Returns
    -------
    filesystem : HadoopFileSystem
    """
    warnings.warn(
        _DEPR_MSG.format("hdfs.connect", "2.0.0", "fs.HadoopFileSystem"),
        FutureWarning, stacklevel=2
    )
    return _connect(
        host=host, port=port, user=user, kerb_ticket=kerb_ticket,
        extra_conf=extra_conf
    )


def _connect(host="default", port=0, user=None, kerb_ticket=None,
             extra_conf=None):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fs = HadoopFileSystem(host=host, port=port, user=user,
                              kerb_ticket=kerb_ticket,
                              extra_conf=extra_conf)
    return fs
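
For illustration (not from the upstream file), the deprecated entry point next to its modern replacement; host and port are placeholders for a real cluster:

    # legacy, emits FutureWarning and needs libhdfs on the CLASSPATH:
    #   import pyarrow.hdfs
    #   fs = pyarrow.hdfs.connect(host="namenode", port=8020)

    # preferred since pyarrow 2.0:
    from pyarrow import fs
    hdfs = fs.HadoopFileSystem(host="namenode", port=8020)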
@@ -0,0 +1,323 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <vector>

#include "arrow/adapters/orc/options.h"
#include "arrow/io/interfaces.h"
#include "arrow/memory_pool.h"
#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {
namespace adapters {
namespace orc {

/// \brief Information about an ORC stripe
struct StripeInformation {
  /// \brief Offset of the stripe from the start of the file, in bytes
  int64_t offset;
  /// \brief Length of the stripe, in bytes
  int64_t length;
  /// \brief Number of rows in the stripe
  int64_t num_rows;
  /// \brief Index of the first row of the stripe
  int64_t first_row_id;
};

/// \class ORCFileReader
/// \brief Read an Arrow Table or RecordBatch from an ORC file.
class ARROW_EXPORT ORCFileReader {
 public:
  ~ORCFileReader();

  /// \brief Creates a new ORC reader
  ///
  /// \param[in] file the data source
  /// \param[in] pool a MemoryPool to use for buffer allocations
  /// \return the returned reader object
  static Result<std::unique_ptr<ORCFileReader>> Open(
      const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool);

  /// \brief Return the schema read from the ORC file
  ///
  /// \return the returned Schema object
  Result<std::shared_ptr<Schema>> ReadSchema();

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read();

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \param[in] schema the Table schema
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema);

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \param[in] include_indices the selected field indices to read
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read(const std::vector<int>& include_indices);

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \param[in] include_names the selected field names to read
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read(const std::vector<std::string>& include_names);

  /// \brief Read the file as a Table
  ///
  /// The table will be composed of one record batch per stripe.
  ///
  /// \param[in] schema the Table schema
  /// \param[in] include_indices the selected field indices to read
  /// \return the returned Table
  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema,
                                      const std::vector<int>& include_indices);

  /// \brief Read a single stripe as a RecordBatch
  ///
  /// \param[in] stripe the stripe index
  /// \return the returned RecordBatch
  Result<std::shared_ptr<RecordBatch>> ReadStripe(int64_t stripe);

  /// \brief Read a single stripe as a RecordBatch
  ///
  /// \param[in] stripe the stripe index
  /// \param[in] include_indices the selected field indices to read
  /// \return the returned RecordBatch
  Result<std::shared_ptr<RecordBatch>> ReadStripe(
      int64_t stripe, const std::vector<int>& include_indices);

  /// \brief Read a single stripe as a RecordBatch
  ///
  /// \param[in] stripe the stripe index
  /// \param[in] include_names the selected field names to read
  /// \return the returned RecordBatch
  Result<std::shared_ptr<RecordBatch>> ReadStripe(
      int64_t stripe, const std::vector<std::string>& include_names);

  /// \brief Seek to a designated row. Invoking NextStripeReader() after a
  /// seek will return a stripe reader starting from the designated row.
  ///
  /// \param[in] row_number the row number to seek to
  Status Seek(int64_t row_number);

  /// \brief Get a stripe level record batch iterator.
  ///
  /// Each record batch will have up to `batch_size` rows.
  /// NextStripeReader serves as a fine grained alternative to ReadStripe,
  /// which may cause OOM issues by loading the whole stripe into memory.
  ///
  /// Note this will only read rows for the current stripe, not the entire
  /// file.
  ///
  /// \param[in] batch_size the maximum number of rows in each record batch
  /// \return the returned stripe reader
  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(int64_t batch_size);

  /// \brief Get a stripe level record batch iterator.
  ///
  /// Each record batch will have up to `batch_size` rows.
  /// NextStripeReader serves as a fine grained alternative to ReadStripe,
  /// which may cause OOM issues by loading the whole stripe into memory.
  ///
  /// Note this will only read rows for the current stripe, not the entire
  /// file.
  ///
  /// \param[in] batch_size the maximum number of rows in each record batch
  /// \param[in] include_indices the selected field indices to read
  /// \return the stripe reader
  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
      int64_t batch_size, const std::vector<int>& include_indices);

  /// \brief Get a record batch iterator for the entire file.
  ///
  /// Each record batch will have up to `batch_size` rows.
  ///
  /// \param[in] batch_size the maximum number of rows in each record batch
  /// \param[in] include_names the selected field names to read, if not empty
  /// (otherwise all fields are read)
  /// \return the record batch iterator
  Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
      int64_t batch_size, const std::vector<std::string>& include_names);

  /// \brief The number of stripes in the file
  int64_t NumberOfStripes();

  /// \brief The number of rows in the file
  int64_t NumberOfRows();
|
||||
|
||||
/// \brief StripeInformation for each stripe.
|
||||
StripeInformation GetStripeInformation(int64_t stripe);
|
||||
|
||||
/// \brief Get the format version of the file.
|
||||
/// Currently known values are 0.11 and 0.12.
|
||||
///
|
||||
/// \return The FileVersion of the ORC file.
|
||||
FileVersion GetFileVersion();
|
||||
|
||||
/// \brief Get the software instance and version that wrote this file.
|
||||
///
|
||||
/// \return a user-facing string that specifies the software version
|
||||
std::string GetSoftwareVersion();
|
||||
|
||||
/// \brief Get the compression kind of the file.
|
||||
///
|
||||
/// \return The kind of compression in the ORC file.
|
||||
Result<Compression::type> GetCompression();
|
||||
|
||||
/// \brief Get the buffer size for the compression.
|
||||
///
|
||||
/// \return Number of bytes to buffer for the compression codec.
|
||||
int64_t GetCompressionSize();
|
||||
|
||||
/// \brief Get the number of rows per an entry in the row index.
|
||||
/// \return the number of rows per an entry in the row index or 0 if there
|
||||
/// is no row index.
|
||||
int64_t GetRowIndexStride();
|
||||
|
||||
/// \brief Get ID of writer that generated the file.
|
||||
///
|
||||
/// \return UNKNOWN_WRITER if the writer ID is undefined
|
||||
WriterId GetWriterId();
|
||||
|
||||
/// \brief Get the writer id value when getWriterId() returns an unknown writer.
|
||||
///
|
||||
/// \return the integer value of the writer ID.
|
||||
int32_t GetWriterIdValue();
|
||||
|
||||
/// \brief Get the version of the writer.
|
||||
///
|
||||
/// \return the version of the writer.
|
||||
|
||||
WriterVersion GetWriterVersion();
|
||||
|
||||
/// \brief Get the number of stripe statistics in the file.
|
||||
///
|
||||
/// \return the number of stripe statistics
|
||||
int64_t GetNumberOfStripeStatistics();
|
||||
|
||||
/// \brief Get the length of the data stripes in the file.
|
||||
///
|
||||
/// \return return the number of bytes in stripes
|
||||
int64_t GetContentLength();
|
||||
|
||||
/// \brief Get the length of the file stripe statistics.
|
||||
///
|
||||
/// \return the number of compressed bytes in the file stripe statistics
|
||||
int64_t GetStripeStatisticsLength();
|
||||
|
||||
/// \brief Get the length of the file footer.
|
||||
///
|
||||
/// \return the number of compressed bytes in the file footer
|
||||
int64_t GetFileFooterLength();
|
||||
|
||||
/// \brief Get the length of the file postscript.
|
||||
///
|
||||
/// \return the number of bytes in the file postscript
|
||||
int64_t GetFilePostscriptLength();
|
||||
|
||||
/// \brief Get the total length of the file.
|
||||
///
|
||||
/// \return the number of bytes in the file
|
||||
int64_t GetFileLength();
|
||||
|
||||
/// \brief Get the serialized file tail.
|
||||
/// Usefull if another reader of the same file wants to avoid re-reading
|
||||
/// the file tail. See ReadOptions.SetSerializedFileTail().
|
||||
///
|
||||
/// \return a string of bytes with the file tail
|
||||
std::string GetSerializedFileTail();
|
||||
|
||||
/// \brief Return the metadata read from the ORC file
|
||||
///
|
||||
/// \return A KeyValueMetadata object containing the ORC metadata
|
||||
Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
std::unique_ptr<Impl> impl_;
|
||||
ORCFileReader();
|
||||
};
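
// A minimal usage sketch for the reader API above, assuming `reader` was
// obtained from this class's Open() factory and errors propagate via the
// ARROW_ASSIGN_OR_RAISE convenience macro:
//
//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Table> table, reader->Read());
//
//   // Or stream the file in bounded batches instead of one big Table:
//   ARROW_ASSIGN_OR_RAISE(
//       auto batch_reader,
//       reader->GetRecordBatchReader(/*batch_size=*/1024, /*include_names=*/{}));
//   std::shared_ptr<RecordBatch> batch;
//   while (batch_reader->ReadNext(&batch).ok() && batch != nullptr) {
//     // consume `batch`
//   }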

/// \class ORCFileWriter
/// \brief Write an Arrow Table or RecordBatch to an ORC file.
class ARROW_EXPORT ORCFileWriter {
 public:
  ~ORCFileWriter();
  /// \brief Creates a new ORC writer.
  ///
  /// \param[in] output_stream a pointer to the io::OutputStream to write into
  /// \param[in] write_options the ORC writer options for Arrow
  /// \return the returned writer object
  static Result<std::unique_ptr<ORCFileWriter>> Open(
      io::OutputStream* output_stream,
      const WriteOptions& write_options = WriteOptions());

  /// \brief Write a table. This can be called multiple times.
  ///
  /// Tables passed in subsequent calls must match the schema of the table that was
  /// written first.
  ///
  /// \param[in] table the Arrow table from which data is extracted.
  /// \return Status
  Status Write(const Table& table);

  /// \brief Write a RecordBatch. This can be called multiple times.
  ///
  /// RecordBatches passed in subsequent calls must match the schema of the
  /// RecordBatch that was written first.
  ///
  /// \param[in] record_batch the Arrow RecordBatch from which data is extracted.
  /// \return Status
  Status Write(const RecordBatch& record_batch);

  /// \brief Close an ORC writer (orc::Writer)
  ///
  /// \return Status
  Status Close();

 private:
  class Impl;
  std::unique_ptr<Impl> impl_;

 private:
  ORCFileWriter();
};
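
// A minimal usage sketch for the writer API above, assuming `out` is an
// already-opened io::OutputStream* and `table` is a populated Table:
//
//   ARROW_ASSIGN_OR_RAISE(auto writer, ORCFileWriter::Open(out));
//   ARROW_RETURN_NOT_OK(writer->Write(table));
//   ARROW_RETURN_NOT_OK(writer->Close());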

}  // namespace orc
}  // namespace adapters
}  // namespace arrow
@@ -0,0 +1,120 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <vector>

#include "arrow/io/interfaces.h"
#include "arrow/status.h"
#include "arrow/util/type_fwd.h"
#include "arrow/util/visibility.h"

namespace arrow {

namespace adapters {

namespace orc {

enum class WriterId : int32_t {
  kOrcJava = 0,
  kOrcCpp = 1,
  kPresto = 2,
  kScritchleyGo = 3,
  kTrino = 4,
  kUnknown = INT32_MAX
};

enum class WriterVersion : int32_t {
  kOriginal = 0,
  kHive8732 = 1,
  kHive4243 = 2,
  kHive12055 = 3,
  kHive13083 = 4,
  kOrc101 = 5,
  kOrc135 = 6,
  kOrc517 = 7,
  kOrc203 = 8,
  kOrc14 = 9,
  kMax = INT32_MAX
};

enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };

class ARROW_EXPORT FileVersion {
 private:
  int32_t major_version_;
  int32_t minor_version_;

 public:
  static const FileVersion& v_0_11();
  static const FileVersion& v_0_12();

  FileVersion(int32_t major, int32_t minor)
      : major_version_(major), minor_version_(minor) {}

  /**
   * Get major version
   */
  int32_t major_version() const { return this->major_version_; }

  /**
   * Get minor version
   */
  int32_t minor_version() const { return this->minor_version_; }

  bool operator==(const FileVersion& right) const {
    return this->major_version() == right.major_version() &&
           this->minor_version() == right.minor_version();
  }

  bool operator!=(const FileVersion& right) const { return !(*this == right); }

  std::string ToString() const;
};

/// Options for the ORC Writer
struct ARROW_EXPORT WriteOptions {
  /// Number of rows the ORC writer writes at a time, default 1024
  int64_t batch_size = 1024;
  /// Which ORC file version to use, default FileVersion(0, 12)
  FileVersion file_version = FileVersion(0, 12);
  /// Size of each ORC stripe in bytes, default 64 MiB
  int64_t stripe_size = 64 * 1024 * 1024;
  /// The compression codec of the ORC file; no compression by default
  Compression::type compression = Compression::UNCOMPRESSED;
  /// The size of each compression block in bytes, default 64 KiB
  int64_t compression_block_size = 64 * 1024;
  /// The compression strategy, i.e. speed vs. size reduction; default
  /// CompressionStrategy::kSpeed
  CompressionStrategy compression_strategy = CompressionStrategy::kSpeed;
  /// The number of rows per entry in the row index, default 10000
  int64_t row_index_stride = 10000;
  /// The padding tolerance, default 0.0
  double padding_tolerance = 0.0;
  /// The dictionary key size threshold: 0 to disable dictionary encoding,
  /// 1 to always enable it; default 0.0
  double dictionary_key_size_threshold = 0.0;
  /// The array of columns that use the bloom filter, default empty
  std::vector<int64_t> bloom_filter_columns;
  /// The upper limit of the false-positive rate of the bloom filter, default 0.05
  double bloom_filter_fpp = 0.05;
};
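
// A sketch of overriding the defaults above; only the fields set here differ
// from their documented defaults (assumes the build has ZSTD support compiled
// in):
//
//   WriteOptions options;
//   options.compression = Compression::ZSTD;
//   options.stripe_size = 128 * 1024 * 1024;  // 128 MiB stripes
//   options.bloom_filter_columns = {0, 2};    // bloom filters on columns 0 and 2
//   // then pass `options` to ORCFileWriter::Open(output_stream, options)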

}  // namespace orc
}  // namespace adapters
}  // namespace arrow
@@ -0,0 +1,128 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>

#include "tensorflow/core/framework/op.h"

#include "arrow/type.h"

// These utilities are supposed to be included in TensorFlow operators
// that need to be compiled separately from Arrow because of ABI issues.
// They therefore need to be header-only.

namespace arrow {

namespace adapters {

namespace tensorflow {

Status GetArrowType(::tensorflow::DataType dtype, std::shared_ptr<DataType>* out) {
  switch (dtype) {
    case ::tensorflow::DT_BOOL:
      *out = arrow::boolean();
      break;
    case ::tensorflow::DT_FLOAT:
      *out = arrow::float32();
      break;
    case ::tensorflow::DT_DOUBLE:
      *out = arrow::float64();
      break;
    case ::tensorflow::DT_HALF:
      *out = arrow::float16();
      break;
    case ::tensorflow::DT_INT8:
      *out = arrow::int8();
      break;
    case ::tensorflow::DT_INT16:
      *out = arrow::int16();
      break;
    case ::tensorflow::DT_INT32:
      *out = arrow::int32();
      break;
    case ::tensorflow::DT_INT64:
      *out = arrow::int64();
      break;
    case ::tensorflow::DT_UINT8:
      *out = arrow::uint8();
      break;
    case ::tensorflow::DT_UINT16:
      *out = arrow::uint16();
      break;
    case ::tensorflow::DT_UINT32:
      *out = arrow::uint32();
      break;
    case ::tensorflow::DT_UINT64:
      *out = arrow::uint64();
      break;
    default:
      return Status::TypeError("TensorFlow data type is not supported");
  }
  return Status::OK();
}

Status GetTensorFlowType(std::shared_ptr<DataType> dtype, ::tensorflow::DataType* out) {
  switch (dtype->id()) {
    case Type::BOOL:
      *out = ::tensorflow::DT_BOOL;
      break;
    case Type::UINT8:
      *out = ::tensorflow::DT_UINT8;
      break;
    case Type::INT8:
      *out = ::tensorflow::DT_INT8;
      break;
    case Type::UINT16:
      *out = ::tensorflow::DT_UINT16;
      break;
    case Type::INT16:
      *out = ::tensorflow::DT_INT16;
      break;
    case Type::UINT32:
      *out = ::tensorflow::DT_UINT32;
      break;
    case Type::INT32:
      *out = ::tensorflow::DT_INT32;
      break;
    case Type::UINT64:
      *out = ::tensorflow::DT_UINT64;
      break;
    case Type::INT64:
      *out = ::tensorflow::DT_INT64;
      break;
    case Type::HALF_FLOAT:
      *out = ::tensorflow::DT_HALF;
      break;
    case Type::FLOAT:
      *out = ::tensorflow::DT_FLOAT;
      break;
    case Type::DOUBLE:
      *out = ::tensorflow::DT_DOUBLE;
      break;
    default:
      return Status::TypeError("Arrow data type is not supported");
  }
  return Status::OK();
}
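
// A round-trip sketch of the two helpers above; the DT_* names are
// TensorFlow's, everything else is Arrow:
//
//   std::shared_ptr<arrow::DataType> arrow_type;
//   ARROW_RETURN_NOT_OK(GetArrowType(::tensorflow::DT_FLOAT, &arrow_type));
//   // arrow_type now holds arrow::float32()
//   ::tensorflow::DataType tf_type;
//   ARROW_RETURN_NOT_OK(GetTensorFlowType(arrow_type, &tf_type));
//   // tf_type is back to ::tensorflow::DT_FLOAT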

}  // namespace tensorflow

}  // namespace adapters

}  // namespace arrow
46
venv/lib/python3.9/site-packages/pyarrow/include/arrow/api.h
Normal file
@@ -0,0 +1,46 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Coarse public API while the library is in development

#pragma once

#include "arrow/array.h"                    // IWYU pragma: export
#include "arrow/array/concatenate.h"        // IWYU pragma: export
#include "arrow/buffer.h"                   // IWYU pragma: export
#include "arrow/builder.h"                  // IWYU pragma: export
#include "arrow/chunked_array.h"            // IWYU pragma: export
#include "arrow/compare.h"                  // IWYU pragma: export
#include "arrow/config.h"                   // IWYU pragma: export
#include "arrow/datum.h"                    // IWYU pragma: export
#include "arrow/extension_type.h"           // IWYU pragma: export
#include "arrow/memory_pool.h"              // IWYU pragma: export
#include "arrow/pretty_print.h"             // IWYU pragma: export
#include "arrow/record_batch.h"             // IWYU pragma: export
#include "arrow/result.h"                   // IWYU pragma: export
#include "arrow/status.h"                   // IWYU pragma: export
#include "arrow/table.h"                    // IWYU pragma: export
#include "arrow/table_builder.h"            // IWYU pragma: export
#include "arrow/tensor.h"                   // IWYU pragma: export
#include "arrow/type.h"                     // IWYU pragma: export
#include "arrow/util/key_value_metadata.h"  // IWYU pragma: export
#include "arrow/visit_array_inline.h"       // IWYU pragma: export
#include "arrow/visit_scalar_inline.h"      // IWYU pragma: export
#include "arrow/visitor.h"                  // IWYU pragma: export

/// \brief Top-level namespace for Apache Arrow C++ API
namespace arrow {}
@@ -0,0 +1,44 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Kitchen-sink public API for arrow::Array data structures. C++ library code
// (especially header files) in Apache Arrow should use more specific headers
// unless it's a file that uses most or all Array types, in which case using
// arrow/array.h is fine.

#pragma once

/// \defgroup numeric-arrays Concrete classes for numeric arrays
/// @{
/// @}

/// \defgroup binary-arrays Concrete classes for binary/string arrays
/// @{
/// @}

/// \defgroup nested-arrays Concrete classes for nested arrays
/// @{
/// @}

#include "arrow/array/array_base.h"       // IWYU pragma: keep
#include "arrow/array/array_binary.h"     // IWYU pragma: keep
#include "arrow/array/array_decimal.h"    // IWYU pragma: keep
#include "arrow/array/array_dict.h"       // IWYU pragma: keep
#include "arrow/array/array_nested.h"     // IWYU pragma: keep
#include "arrow/array/array_primitive.h"  // IWYU pragma: keep
#include "arrow/array/data.h"             // IWYU pragma: keep
#include "arrow/array/util.h"             // IWYU pragma: keep
@@ -0,0 +1,264 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <iosfwd>
#include <memory>
#include <string>
#include <vector>

#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
#include "arrow/visitor.h"

namespace arrow {

// ----------------------------------------------------------------------
// User array accessor types

/// \brief Array base type
/// Immutable data array with some logical type and some length.
///
/// Any memory is owned by the respective Buffer instance (or its parents).
///
/// The base class is only required to have a null bitmap buffer if the null
/// count is greater than 0.
///
/// If known, the null count can be provided in the base Array constructor. If
/// the null count is not known, pass -1 to indicate that the null count is to
/// be computed on the first call to null_count().
class ARROW_EXPORT Array {
 public:
  virtual ~Array() = default;

  /// \brief Return true if value at index is null. Does not boundscheck
  bool IsNull(int64_t i) const {
    return null_bitmap_data_ != NULLPTR
               ? !bit_util::GetBit(null_bitmap_data_, i + data_->offset)
               : data_->null_count == data_->length;
  }

  /// \brief Return true if value at index is valid (not null). Does not
  /// boundscheck
  bool IsValid(int64_t i) const {
    return null_bitmap_data_ != NULLPTR
               ? bit_util::GetBit(null_bitmap_data_, i + data_->offset)
               : data_->null_count != data_->length;
  }

  /// \brief Return a Scalar containing the value of this array at i
  Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;

  /// The number of elements this array contains.
  int64_t length() const { return data_->length; }

  /// A relative position into another array's data, to enable zero-copy
  /// slicing. This value defaults to zero.
  int64_t offset() const { return data_->offset; }

  /// The number of null entries in the array. If the null count was not known
  /// at time of construction (and set to a negative value), then the null
  /// count will be computed and cached on the first invocation of this
  /// function.
  int64_t null_count() const;

  std::shared_ptr<DataType> type() const { return data_->type; }
  Type::type type_id() const { return data_->type->id(); }

  /// Buffer for the validity (null) bitmap, if any. Note that Union types
  /// never have a null bitmap.
  ///
  /// Note that for `null_count == 0` or for null type, this will be null.
  /// This buffer does not account for any slice offset.
  const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }

  /// Raw pointer to the null bitmap.
  ///
  /// Note that for `null_count == 0` or for null type, this will be null.
  /// This buffer does not account for any slice offset.
  const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }

  /// Equality comparison with another array
  bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
  bool Equals(const std::shared_ptr<Array>& arr,
              const EqualOptions& = EqualOptions::Defaults()) const;

  /// \brief Return the formatted unified diff of arrow::Diff between this
  /// Array and another Array
  std::string Diff(const Array& other) const;

  /// Approximate equality comparison with another array
  ///
  /// epsilon is only used if this is FloatArray or DoubleArray
  bool ApproxEquals(const std::shared_ptr<Array>& arr,
                    const EqualOptions& = EqualOptions::Defaults()) const;
  bool ApproxEquals(const Array& arr,
                    const EqualOptions& = EqualOptions::Defaults()) const;

  /// Compare whether the specified range of slots is equal for the given
  /// array and this array. end_idx is exclusive. This method does not
  /// bounds check.
  bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
                   const Array& other,
                   const EqualOptions& = EqualOptions::Defaults()) const;
  bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
                   const std::shared_ptr<Array>& other,
                   const EqualOptions& = EqualOptions::Defaults()) const;
  bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
                   int64_t other_start_idx,
                   const EqualOptions& = EqualOptions::Defaults()) const;
  bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
                   int64_t end_idx, int64_t other_start_idx,
                   const EqualOptions& = EqualOptions::Defaults()) const;

  /// \brief Apply the ArrayVisitor::Visit() method specialized to the array type
  Status Accept(ArrayVisitor* visitor) const;

  /// Construct a zero-copy view of this array with the given type.
  ///
  /// This method checks if the types are layout-compatible.
  /// Nested types are traversed in depth-first order. Data buffers must have
  /// the same item sizes, even though the logical types may be different.
  /// An error is returned if the types are not layout-compatible.
  Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;

  /// Construct a zero-copy slice of the array with the indicated offset and
  /// length
  ///
  /// \param[in] offset the position of the first element in the constructed
  /// slice
  /// \param[in] length the length of the slice. If there are not enough
  /// elements in the array, the length will be adjusted accordingly
  ///
  /// \return a new object wrapped in std::shared_ptr<Array>
  std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;

  /// Slice from offset until end of the array
  std::shared_ptr<Array> Slice(int64_t offset) const;

  /// Input-checking variant of Array::Slice
  Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
  /// Input-checking variant of Array::Slice
  Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;

  const std::shared_ptr<ArrayData>& data() const { return data_; }

  int num_fields() const { return static_cast<int>(data_->child_data.size()); }

  /// \return PrettyPrint representation of array suitable for debugging
  std::string ToString() const;

  /// \brief Perform cheap validation checks to determine obvious inconsistencies
  /// within the array's internal data.
  ///
  /// This is O(k) where k is the number of descendants.
  ///
  /// \return Status
  Status Validate() const;

  /// \brief Perform extensive validation checks to determine inconsistencies
  /// within the array's internal data.
  ///
  /// This is potentially O(k*n) where k is the number of descendants and n
  /// is the array length.
  ///
  /// \return Status
  Status ValidateFull() const;

 protected:
  Array() = default;
  ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);

  std::shared_ptr<ArrayData> data_;
  const uint8_t* null_bitmap_data_ = NULLPTR;

  /// Protected method for constructors
  void SetData(const std::shared_ptr<ArrayData>& data) {
    if (data->buffers.size() > 0) {
      null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
    } else {
      null_bitmap_data_ = NULLPTR;
    }
    data_ = data;
  }

 private:
  ARROW_DISALLOW_COPY_AND_ASSIGN(Array);

  ARROW_FRIEND_EXPORT friend void PrintTo(const Array& x, std::ostream* os);
};

static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
  os << x.ToString();
  return os;
}
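
// A sketch of the accessors above on an arbitrary `std::shared_ptr<Array> arr`
// (assumed already constructed); note that IsNull() does not boundscheck:
//
//   for (int64_t i = 0; i < arr->length(); ++i) {
//     if (arr->IsNull(i)) continue;
//     // access value i through a concrete subclass
//   }
//   std::shared_ptr<Array> tail = arr->Slice(arr->length() / 2);  // zero-copy
//   ARROW_RETURN_NOT_OK(tail->ValidateFull());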

/// Base class for non-nested arrays
class ARROW_EXPORT FlatArray : public Array {
 protected:
  using Array::Array;
};

/// Base class for arrays of fixed-size logical types
class ARROW_EXPORT PrimitiveArray : public FlatArray {
 public:
  PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
                 const std::shared_ptr<Buffer>& data,
                 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                 int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// Does not account for any slice offset
  std::shared_ptr<Buffer> values() const { return data_->buffers[1]; }

 protected:
  PrimitiveArray() : raw_values_(NULLPTR) {}

  void SetData(const std::shared_ptr<ArrayData>& data) {
    this->Array::SetData(data);
    raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
  }

  explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

  const uint8_t* raw_values_;
};

/// Degenerate null type Array
class ARROW_EXPORT NullArray : public FlatArray {
 public:
  using TypeClass = NullType;

  explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
  explicit NullArray(int64_t length);

 private:
  void SetData(const std::shared_ptr<ArrayData>& data) {
    null_bitmap_data_ = NULLPTR;
    data->null_count = data->length;
    data_ = data;
  }
};

}  // namespace arrow
@@ -0,0 +1,269 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Array accessor classes for Binary, LargeBinary, String, LargeString,
// FixedSizeBinary

#pragma once

#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup binary-arrays
///
/// @{

// ----------------------------------------------------------------------
// Binary and String

/// Base class for variable-sized binary arrays, regardless of offset size
/// and logical interpretation.
template <typename TYPE>
class BaseBinaryArray : public FlatArray {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;
  using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;

  /// Return the pointer to the given element's bytes
  // XXX should GetValue(int64_t i) return a string_view?
  const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
    // Account for base offset
    i += data_->offset;
    const offset_type pos = raw_value_offsets_[i];
    *out_length = raw_value_offsets_[i + 1] - pos;
    return raw_data_ + pos;
  }

  /// \brief Get binary value as a string_view
  ///
  /// \param i the value index
  /// \return the view over the selected value
  std::string_view GetView(int64_t i) const {
    // Account for base offset
    i += data_->offset;
    const offset_type pos = raw_value_offsets_[i];
    return std::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
                            raw_value_offsets_[i + 1] - pos);
  }

  std::optional<std::string_view> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  /// \brief Get binary value as a string_view
  /// Provided for consistency with other arrays.
  ///
  /// \param i the value index
  /// \return the view over the selected value
  std::string_view Value(int64_t i) const { return GetView(i); }

  /// \brief Get binary value as a std::string
  ///
  /// \param i the value index
  /// \return the value copied into a std::string
  std::string GetString(int64_t i) const { return std::string(GetView(i)); }

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }

  const offset_type* raw_value_offsets() const {
    return raw_value_offsets_ + data_->offset;
  }

  const uint8_t* raw_data() const { return raw_data_; }

  /// \brief Return the data buffer absolute offset of the data for the value
  /// at the passed index.
  ///
  /// Does not perform boundschecking
  offset_type value_offset(int64_t i) const {
    return raw_value_offsets_[i + data_->offset];
  }

  /// \brief Return the length of the data for the value at the passed index.
  ///
  /// Does not perform boundschecking
  offset_type value_length(int64_t i) const {
    i += data_->offset;
    return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
  }

  /// \brief Return the total length of the memory in the data buffer
  /// referenced by this array. If the array has been sliced then this may be
  /// less than the size of the data buffer (data_->buffers[2]).
  offset_type total_values_length() const {
    if (data_->length > 0) {
      return raw_value_offsets_[data_->length + data_->offset] -
             raw_value_offsets_[data_->offset];
    } else {
      return 0;
    }
  }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  // For subclasses
  BaseBinaryArray() = default;

  // Protected method for constructors
  void SetData(const std::shared_ptr<ArrayData>& data) {
    this->Array::SetData(data);
    raw_value_offsets_ = data->GetValuesSafe<offset_type>(1, /*offset=*/0);
    raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
  }

  const offset_type* raw_value_offsets_ = NULLPTR;
  const uint8_t* raw_data_ = NULLPTR;
};

/// Concrete Array class for variable-size binary data
class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
 public:
  explicit BinaryArray(const std::shared_ptr<ArrayData>& data);

  BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
              const std::shared_ptr<Buffer>& data,
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
              int64_t null_count = kUnknownNullCount, int64_t offset = 0);

 protected:
  // For subclasses such as StringArray
  BinaryArray() : BaseBinaryArray() {}
};

/// Concrete Array class for variable-size string (utf-8) data
class ARROW_EXPORT StringArray : public BinaryArray {
 public:
  using TypeClass = StringType;

  explicit StringArray(const std::shared_ptr<ArrayData>& data);

  StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
              const std::shared_ptr<Buffer>& data,
              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
              int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Validate that this array contains only valid UTF8 entries
  ///
  /// This check is also implied by ValidateFull()
  Status ValidateUTF8() const;
};
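
// A sketch of zero-copy access through the API above, assuming `strs` is a
// populated StringArray; each GetView() returns a std::string_view into the
// array's data buffer and stays valid only while the array is alive:
//
//   for (int64_t i = 0; i < strs.length(); ++i) {
//     if (!strs.IsValid(i)) continue;
//     std::string_view v = strs.GetView(i);    // no copy
//     std::string owned = strs.GetString(i);   // copies; use when ownership is needed
//   }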

/// Concrete Array class for large variable-size binary data
class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
 public:
  explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);

  LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
                   const std::shared_ptr<Buffer>& data,
                   const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                   int64_t null_count = kUnknownNullCount, int64_t offset = 0);

 protected:
  // For subclasses such as LargeStringArray
  LargeBinaryArray() : BaseBinaryArray() {}
};

/// Concrete Array class for large variable-size string (utf-8) data
class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
 public:
  using TypeClass = LargeStringType;

  explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);

  LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
                   const std::shared_ptr<Buffer>& data,
                   const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                   int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Validate that this array contains only valid UTF8 entries
  ///
  /// This check is also implied by ValidateFull()
  Status ValidateUTF8() const;
};

// ----------------------------------------------------------------------
// Fixed width binary

/// Concrete Array class for fixed-size binary data
class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
 public:
  using TypeClass = FixedSizeBinaryType;
  using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;

  explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);

  FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
                       const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  const uint8_t* GetValue(int64_t i) const;
  const uint8_t* Value(int64_t i) const { return GetValue(i); }

  std::string_view GetView(int64_t i) const {
    return std::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width());
  }

  std::optional<std::string_view> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  std::string GetString(int64_t i) const { return std::string(GetView(i)); }

  int32_t byte_width() const { return byte_width_; }

  const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width_; }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data) {
    this->PrimitiveArray::SetData(data);
    byte_width_ =
        internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
  }

  int32_t byte_width_;
};

/// @}

}  // namespace arrow
@@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <string>

#include "arrow/array/array_binary.h"
#include "arrow/array/data.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup numeric-arrays
///
/// @{

// ----------------------------------------------------------------------
// Decimal128Array

/// Concrete Array class for 128-bit decimal data
class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal128Type;

  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal128Array from ArrayData instance
  explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);

  std::string FormatValue(int64_t i) const;
};

// Backward compatibility
using DecimalArray = Decimal128Array;

// ----------------------------------------------------------------------
// Decimal256Array

/// Concrete Array class for 256-bit decimal data
class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
 public:
  using TypeClass = Decimal256Type;

  using FixedSizeBinaryArray::FixedSizeBinaryArray;

  /// \brief Construct Decimal256Array from ArrayData instance
  explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);

  std::string FormatValue(int64_t i) const;
};
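
// A sketch of the decimal accessors above, assuming `arr` is known to be a
// Decimal128Array; FormatValue() renders slot i using the type's precision
// and scale (e.g. "123.45"):
//
//   const auto& decimals = static_cast<const Decimal128Array&>(*arr);
//   for (int64_t i = 0; i < decimals.length(); ++i) {
//     if (decimals.IsValid(i)) {
//       std::string text = decimals.FormatValue(i);
//     }
//   }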

/// @}

}  // namespace arrow
@@ -0,0 +1,180 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>

#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

// ----------------------------------------------------------------------
// DictionaryArray

/// \brief Array type for dictionary-encoded data with a
/// data-dependent dictionary
///
/// A dictionary array contains an array of non-negative integers (the
/// "dictionary indices") along with a data type containing a "dictionary"
/// corresponding to the distinct values represented in the data.
///
/// For example, the array
///
///   ["foo", "bar", "foo", "bar", "foo", "bar"]
///
/// with dictionary ["bar", "foo"], would have the dictionary array representation
///
///   indices: [1, 0, 1, 0, 1, 0]
///   dictionary: ["bar", "foo"]
///
/// The indices in principle may be any integer type.
class ARROW_EXPORT DictionaryArray : public Array {
 public:
  using TypeClass = DictionaryType;

  explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);

  DictionaryArray(const std::shared_ptr<DataType>& type,
                  const std::shared_ptr<Array>& indices,
                  const std::shared_ptr<Array>& dictionary);

  /// \brief Construct a DictionaryArray from dictionary and indices
  /// arrays and validate
  ///
  /// This function validates the indices and input type. It checks that
  /// all indices are non-negative and smaller than the size of the dictionary.
  ///
  /// \param[in] type a dictionary type
  /// \param[in] dictionary the dictionary with the same value type as the
  /// type object
  /// \param[in] indices an array of non-negative integers smaller than the
  /// size of the dictionary
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
      const std::shared_ptr<Array>& dictionary);

  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
    return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
                      dictionary);
  }

  /// \brief Transpose this DictionaryArray
  ///
  /// This method constructs a new dictionary array with the given dictionary
  /// type, transposing indices using the transpose map. The type and the
  /// transpose map are typically computed using DictionaryUnifier.
  ///
  /// \param[in] type the new type object
  /// \param[in] dictionary the new dictionary
  /// \param[in] transpose_map transposition array of this array's indices
  /// into the target array's indices
  /// \param[in] pool a pool to allocate the array data from
  Result<std::shared_ptr<Array>> Transpose(
      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
      const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;

  /// \brief Determine whether dictionary arrays may be compared without unification
  bool CanCompareIndices(const DictionaryArray& other) const;

  /// \brief Return the dictionary for this array, which is stored as
  /// a member of the ArrayData internal structure
  std::shared_ptr<Array> dictionary() const;
  std::shared_ptr<Array> indices() const;

  /// \brief Return the ith value of the indices, cast to int64_t. Not
  /// recommended for use in performance-sensitive code. Does not validate
  /// whether the value is null or out-of-bounds.
  int64_t GetValueIndex(int64_t i) const;

  const DictionaryType* dict_type() const { return dict_type_; }

 private:
  void SetData(const std::shared_ptr<ArrayData>& data);
  const DictionaryType* dict_type_;
  std::shared_ptr<Array> indices_;

  // Lazily initialized when invoking dictionary()
  mutable std::shared_ptr<Array> dictionary_;
};
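
// A sketch of building the example from the class comment above, assuming
// `indices` is an int8 Array holding [1, 0, 1, 0, 1, 0] and `dict` is a
// StringArray holding ["bar", "foo"]:
//
//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> dict_array,
//                         DictionaryArray::FromArrays(indices, dict));
//   // dict_array->type() is dictionary(int8(), utf8())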

/// \brief Helper class for incremental dictionary unification
class ARROW_EXPORT DictionaryUnifier {
 public:
  virtual ~DictionaryUnifier() = default;

  /// \brief Construct a DictionaryUnifier
  /// \param[in] value_type the data type of the dictionaries
  /// \param[in] pool MemoryPool to use for memory allocations
  static Result<std::unique_ptr<DictionaryUnifier>> Make(
      std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());

  /// \brief Unify dictionaries across array chunks
  ///
  /// The dictionaries in the array chunks will be unified, their indices
  /// accordingly transposed.
  ///
  /// Only dictionaries with a primitive value type are currently supported.
  /// However, dictionaries nested inside a more complex type are correctly unified.
  static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
      const std::shared_ptr<ChunkedArray>& array,
      MemoryPool* pool = default_memory_pool());

  /// \brief Unify dictionaries across the chunks of each table column
  ///
  /// The dictionaries in each table column will be unified, their indices
  /// accordingly transposed.
  ///
  /// Only dictionaries with a primitive value type are currently supported.
  /// However, dictionaries nested inside a more complex type are correctly unified.
  static Result<std::shared_ptr<Table>> UnifyTable(
      const Table& table, MemoryPool* pool = default_memory_pool());

  /// \brief Append dictionary to the internal memo
  virtual Status Unify(const Array& dictionary) = 0;

  /// \brief Append dictionary and compute transpose indices
  /// \param[in] dictionary the dictionary values to unify
  /// \param[out] out_transpose a Buffer containing computed transpose indices
  /// as int32_t values equal in length to the passed dictionary. The value in
  /// each slot corresponds to the new index value for each original index
  /// for a DictionaryArray with the old dictionary
  virtual Status Unify(const Array& dictionary,
                       std::shared_ptr<Buffer>* out_transpose) = 0;

  /// \brief Return a result DictionaryType with the smallest possible index
  /// type to accommodate the unified dictionary. The unifier cannot be used
  /// after this is called
  virtual Status GetResult(std::shared_ptr<DataType>* out_type,
                           std::shared_ptr<Array>* out_dict) = 0;

  /// \brief Return a unified dictionary with the given index type. If
  /// the index type is not large enough then an invalid status will be returned.
  /// The unifier cannot be used after this is called
  virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
                                        std::shared_ptr<Array>* out_dict) = 0;
};
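
// A sketch of incremental unification with the API above, assuming `d1` and
// `d2` are string dictionaries with overlapping values:
//
//   ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(utf8()));
//   std::shared_ptr<Buffer> transpose1, transpose2;
//   ARROW_RETURN_NOT_OK(unifier->Unify(*d1, &transpose1));
//   ARROW_RETURN_NOT_OK(unifier->Unify(*d2, &transpose2));
//   std::shared_ptr<DataType> out_type;
//   std::shared_ptr<Array> out_dict;
//   ARROW_RETURN_NOT_OK(unifier->GetResult(&out_type, &out_dict));
//   // transpose1/transpose2 remap each input's old indices into out_dict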

}  // namespace arrow
@@ -0,0 +1,584 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Array accessor classes for List, LargeList, FixedSizeList, Map, Struct, and
// Union

#pragma once

#include <cstdint>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup nested-arrays
///
/// @{

// ----------------------------------------------------------------------
// ListArray

template <typename TYPE>
class BaseListArray;

namespace internal {

// Private helper for ListArray::SetData.
// Unfortunately, trying to define BaseListArray::SetData outside of this header
// doesn't play well with MSVC.
template <typename TYPE>
void SetListData(BaseListArray<TYPE>* self, const std::shared_ptr<ArrayData>& data,
                 Type::type expected_type_id = TYPE::type_id);

}  // namespace internal

/// Base class for variable-sized list arrays, regardless of offset size.
template <typename TYPE>
class BaseListArray : public Array {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  const TypeClass* list_type() const { return list_type_; }

  /// \brief Return array object containing the list's values
  ///
  /// Note that this buffer does not account for any slice offset or length.
  std::shared_ptr<Array> values() const { return values_; }

  /// Note that this buffer does not account for any slice offset or length.
  std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }

  std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }

  /// Return pointer to raw value offsets accounting for any slice offset
  const offset_type* raw_value_offsets() const {
    return raw_value_offsets_ + data_->offset;
  }

  // The following functions will not perform boundschecking
  offset_type value_offset(int64_t i) const {
    return raw_value_offsets_[i + data_->offset];
  }
  offset_type value_length(int64_t i) const {
    i += data_->offset;
    return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
  }
  std::shared_ptr<Array> value_slice(int64_t i) const {
    return values_->Slice(value_offset(i), value_length(i));
  }

 protected:
  friend void internal::SetListData<TYPE>(BaseListArray<TYPE>* self,
                                          const std::shared_ptr<ArrayData>& data,
                                          Type::type expected_type_id);

  const TypeClass* list_type_ = NULLPTR;
  std::shared_ptr<Array> values_;
  const offset_type* raw_value_offsets_ = NULLPTR;
};

/// Concrete Array class for list data
class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
 public:
  explicit ListArray(std::shared_ptr<ArrayData> data);

  ListArray(std::shared_ptr<DataType> type, int64_t length,
            std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
            std::shared_ptr<Buffer> null_bitmap = NULLPTR,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct ListArray from array of offsets and child value array
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types, and will allocate a new offsets array if necessary (i.e. if
  /// the offsets contain any nulls). If the offsets do not have nulls, they
  /// are assumed to be well-formed.
  ///
  /// Nulls can be encoded in the offsets array's own null bitmap or in an
  /// explicit null_bitmap, but not both.
  ///
  /// \param[in] offsets Array containing n + 1 offsets encoding length and
  /// size. Must be of int32 type
  /// \param[in] values Array containing list values
  /// \param[in] pool MemoryPool in case a new offsets array needs to be
  /// allocated because of null values
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  static Result<std::shared_ptr<ListArray>> FromArrays(
      const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  static Result<std::shared_ptr<ListArray>> FromArrays(
      std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
      MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Note that it differs from `values()` in that it takes this array's
  /// offsets into account, as well as null elements backed by non-empty lists
  /// (they are skipped, thus copying may be needed).
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Return list offsets as an Int32Array
  ///
  /// The returned array will not have a validity bitmap, so you cannot expect
  /// to pass it to ListArray::FromArrays() and get back the same list array
  /// if the original one has nulls.
  std::shared_ptr<Array> offsets() const;

 protected:
  // This constructor defers SetData to a derived array class
  ListArray() = default;

  void SetData(const std::shared_ptr<ArrayData>& data);
};
|
||||
|
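// A minimal usage sketch for ListArray::FromArrays, value_slice() and
// Flatten(), assuming <arrow/api.h> (Int32Builder, ARROW_RETURN_NOT_OK,
// ARROW_ASSIGN_OR_RAISE) is available; LargeListArray below works the same
// way with int64 offsets. The function name is illustrative only.
inline arrow::Status ListArrayExample() {
  arrow::Int32Builder offsets_builder, values_builder;
  ARROW_RETURN_NOT_OK(offsets_builder.AppendValues({0, 2, 5}));
  ARROW_RETURN_NOT_OK(values_builder.AppendValues({1, 2, 3, 4, 5}));
  ARROW_ASSIGN_OR_RAISE(auto offsets, offsets_builder.Finish());
  ARROW_ASSIGN_OR_RAISE(auto values, values_builder.Finish());
  // Encodes two lists: [1, 2] and [3, 4, 5].
  ARROW_ASSIGN_OR_RAISE(auto list_array,
                        arrow::ListArray::FromArrays(*offsets, *values));
  std::shared_ptr<arrow::Array> first = list_array->value_slice(0);  // [1, 2]
  ARROW_ASSIGN_OR_RAISE(auto flat, list_array->Flatten());  // [1, 2, 3, 4, 5]
  (void)first;
  (void)flat;
  return arrow::Status::OK();
}
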
/// Concrete Array class for large list data (with 64-bit offsets)
class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
 public:
  explicit LargeListArray(const std::shared_ptr<ArrayData>& data);

  LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
                 const std::shared_ptr<Buffer>& value_offsets,
                 const std::shared_ptr<Array>& values,
                 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                 int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct LargeListArray from array of offsets and child value array
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types, and will allocate a new offsets array if necessary (i.e. if
  /// the offsets contain any nulls). If the offsets do not have nulls, they
  /// are assumed to be well-formed.
  ///
  /// \param[in] offsets Array containing n + 1 offsets encoding length and
  /// size. Must be of int64 type
  /// \param[in] values Array containing list values
  /// \param[in] pool MemoryPool in case new offsets array needs to be
  /// allocated because of null values
  /// \param[in] null_bitmap Optional validity bitmap
  /// \param[in] null_count Optional null count in null_bitmap
  static Result<std::shared_ptr<LargeListArray>> FromArrays(
      const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  static Result<std::shared_ptr<LargeListArray>> FromArrays(
      std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
      MemoryPool* pool = default_memory_pool(),
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount);

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration this array's offsets as well as null elements backed
  /// by non-empty lists (they are skipped, thus copying may be needed).
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Return list offsets as an Int64Array
  std::shared_ptr<Array> offsets() const;

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);
};

// ----------------------------------------------------------------------
// MapArray

/// Concrete Array class for map data
///
/// NB: "value" in this context refers to a pair of a key and the corresponding item
class ARROW_EXPORT MapArray : public ListArray {
 public:
  using TypeClass = MapType;

  explicit MapArray(const std::shared_ptr<ArrayData>& data);

  MapArray(const std::shared_ptr<DataType>& type, int64_t length,
           const std::shared_ptr<Buffer>& value_offsets,
           const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
           const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
           int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  MapArray(const std::shared_ptr<DataType>& type, int64_t length,
           const std::shared_ptr<Buffer>& value_offsets,
           const std::shared_ptr<Array>& values,
           const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
           int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Construct MapArray from array of offsets and child key, item arrays
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types, and will allocate a new offsets array if necessary (i.e. if
  /// the offsets contain any nulls). If the offsets do not have nulls, they
  /// are assumed to be well-formed.
  ///
  /// \param[in] offsets Array containing n + 1 offsets encoding length and
  /// size. Must be of int32 type
  /// \param[in] keys Array containing key values
  /// \param[in] items Array containing item values
  /// \param[in] pool MemoryPool in case new offsets array needs to be
  /// allocated because of null values
  static Result<std::shared_ptr<Array>> FromArrays(
      const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
      const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool());

  static Result<std::shared_ptr<Array>> FromArrays(
      std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
      const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
      MemoryPool* pool = default_memory_pool());

  const MapType* map_type() const { return map_type_; }

  /// \brief Return array object containing all map keys
  std::shared_ptr<Array> keys() const { return keys_; }

  /// \brief Return array object containing all mapped items
  std::shared_ptr<Array> items() const { return items_; }

  /// Validate child data before constructing the actual MapArray.
  static Status ValidateChildData(
      const std::vector<std::shared_ptr<ArrayData>>& child_data);

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);

  static Result<std::shared_ptr<Array>> FromArraysInternal(
      std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
      const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
      MemoryPool* pool);

 private:
  const MapType* map_type_;
  std::shared_ptr<Array> keys_, items_;
};

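// A minimal usage sketch for MapArray::FromArrays, assuming <arrow/api.h>
// (StringBuilder, Int64Builder, etc.); the function name is illustrative only.
inline arrow::Status MapArrayExample() {
  arrow::Int32Builder offsets_builder;
  arrow::StringBuilder keys_builder;
  arrow::Int64Builder items_builder;
  ARROW_RETURN_NOT_OK(offsets_builder.AppendValues({0, 1, 3}));
  ARROW_RETURN_NOT_OK(keys_builder.AppendValues({"a", "b", "c"}));
  ARROW_RETURN_NOT_OK(items_builder.AppendValues({1, 2, 3}));
  ARROW_ASSIGN_OR_RAISE(auto offsets, offsets_builder.Finish());
  ARROW_ASSIGN_OR_RAISE(auto keys, keys_builder.Finish());
  ARROW_ASSIGN_OR_RAISE(auto items, items_builder.Finish());
  // Encodes two maps: {"a": 1} and {"b": 2, "c": 3}.
  ARROW_ASSIGN_OR_RAISE(auto map_array,
                        arrow::MapArray::FromArrays(offsets, keys, items));
  (void)map_array;
  return arrow::Status::OK();
}
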
// ----------------------------------------------------------------------
// FixedSizeListArray

/// Concrete Array class for fixed size list data
class ARROW_EXPORT FixedSizeListArray : public Array {
 public:
  using TypeClass = FixedSizeListType;
  using offset_type = TypeClass::offset_type;

  explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);

  FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
                     const std::shared_ptr<Array>& values,
                     const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                     int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  const FixedSizeListType* list_type() const;

  /// \brief Return array object containing the list's values
  std::shared_ptr<Array> values() const;

  std::shared_ptr<DataType> value_type() const;

  // The following functions will not perform boundschecking
  int64_t value_offset(int64_t i) const {
    i += data_->offset;
    return list_size_ * i;
  }
  int32_t value_length(int64_t i = 0) const {
    ARROW_UNUSED(i);
    return list_size_;
  }
  std::shared_ptr<Array> value_slice(int64_t i) const {
    return values_->Slice(value_offset(i), value_length(i));
  }

  /// \brief Return an Array that is a concatenation of the lists in this array.
  ///
  /// Note that it's different from `values()` in that it takes into
  /// consideration null elements (they are skipped, thus copying may be needed).
  Result<std::shared_ptr<Array>> Flatten(
      MemoryPool* memory_pool = default_memory_pool()) const;

  /// \brief Construct FixedSizeListArray from child value array and value_length
  ///
  /// \param[in] values Array containing list values
  /// \param[in] list_size The fixed length of each list
  /// \return Will have length equal to values.length() / list_size
  static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
                                                   int32_t list_size);

  /// \brief Construct FixedSizeListArray from child value array and type
  ///
  /// \param[in] values Array containing list values
  /// \param[in] type The fixed sized list type
  /// \return Will have length equal to values.length() / type.list_size()
  static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
                                                   std::shared_ptr<DataType> type);

 protected:
  void SetData(const std::shared_ptr<ArrayData>& data);
  int32_t list_size_;

 private:
  std::shared_ptr<Array> values_;
};

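// A minimal usage sketch for FixedSizeListArray::FromArrays, assuming
// <arrow/api.h>; the function name is illustrative only.
inline arrow::Status FixedSizeListExample() {
  arrow::Int64Builder values_builder;
  ARROW_RETURN_NOT_OK(values_builder.AppendValues({1, 2, 3, 4, 5, 6}));
  ARROW_ASSIGN_OR_RAISE(auto values, values_builder.Finish());
  // With list_size 2, this yields three lists: [1, 2], [3, 4], [5, 6].
  ARROW_ASSIGN_OR_RAISE(
      auto fsl_array,
      arrow::FixedSizeListArray::FromArrays(values, /*list_size=*/2));
  (void)fsl_array;
  return arrow::Status::OK();
}
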
// ----------------------------------------------------------------------
// Struct

/// Concrete Array class for struct data
class ARROW_EXPORT StructArray : public Array {
 public:
  using TypeClass = StructType;

  explicit StructArray(const std::shared_ptr<ArrayData>& data);

  StructArray(const std::shared_ptr<DataType>& type, int64_t length,
              const std::vector<std::shared_ptr<Array>>& children,
              std::shared_ptr<Buffer> null_bitmap = NULLPTR,
              int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return a StructArray from child arrays and field names.
  ///
  /// The length and data type are automatically inferred from the arguments.
  /// There should be at least one child array.
  static Result<std::shared_ptr<StructArray>> Make(
      const ArrayVector& children, const std::vector<std::string>& field_names,
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  /// \brief Return a StructArray from child arrays and fields.
  ///
  /// The length is automatically inferred from the arguments.
  /// There should be at least one child array. This method does not
  /// check that field types and child array types are consistent.
  static Result<std::shared_ptr<StructArray>> Make(
      const ArrayVector& children, const FieldVector& fields,
      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
      int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  const StructType* struct_type() const;

  // Return a shared pointer in case the requestor desires to share ownership
  // with this array. The returned array has its offset, length and null
  // count adjusted.
  const std::shared_ptr<Array>& field(int pos) const;

  const ArrayVector& fields() const;

  /// Returns null if name not found
  std::shared_ptr<Array> GetFieldByName(const std::string& name) const;

  /// \brief Flatten this array as a vector of arrays, one for each field
  ///
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;

  /// \brief Get one of the child arrays, combining its null bitmap
  /// with the parent struct array's bitmap.
  ///
  /// \param[in] index Which child array to get
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  Result<std::shared_ptr<Array>> GetFlattenedField(
      int index, MemoryPool* pool = default_memory_pool()) const;

 private:
  // For caching boxed child data
  // XXX This is not handled in a thread-safe manner.
  mutable ArrayVector boxed_fields_;
};

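// A minimal usage sketch for StructArray::Make and GetFlattenedField,
// assuming <arrow/api.h>; the function name is illustrative only.
inline arrow::Status StructArrayExample() {
  arrow::Int32Builder id_builder;
  arrow::StringBuilder name_builder;
  ARROW_RETURN_NOT_OK(id_builder.AppendValues({1, 2}));
  ARROW_RETURN_NOT_OK(name_builder.AppendValues({"a", "b"}));
  ARROW_ASSIGN_OR_RAISE(auto ids, id_builder.Finish());
  ARROW_ASSIGN_OR_RAISE(auto names, name_builder.Finish());
  ARROW_ASSIGN_OR_RAISE(
      auto struct_array,
      arrow::StructArray::Make({ids, names}, std::vector<std::string>{"id", "name"}));
  // Unlike field(0), this merges the child's validity bitmap with the parent's.
  ARROW_ASSIGN_OR_RAISE(auto id_column, struct_array->GetFlattenedField(0));
  (void)id_column;
  return arrow::Status::OK();
}
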
// ----------------------------------------------------------------------
// Union

/// Base class for SparseUnionArray and DenseUnionArray
class ARROW_EXPORT UnionArray : public Array {
 public:
  using type_code_t = int8_t;

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> type_codes() const { return data_->buffers[1]; }

  const type_code_t* raw_type_codes() const { return raw_type_codes_ + data_->offset; }

  /// The logical type code of the value at index.
  type_code_t type_code(int64_t i) const { return raw_type_codes_[i + data_->offset]; }

  /// The physical child id containing value at index.
  int child_id(int64_t i) const {
    return union_type_->child_ids()[raw_type_codes_[i + data_->offset]];
  }

  const UnionType* union_type() const { return union_type_; }

  UnionMode::type mode() const { return union_type_->mode(); }

  /// \brief Return the given field as an individual array.
  ///
  /// For sparse unions, the returned array has its offset, length and null
  /// count adjusted.
  std::shared_ptr<Array> field(int pos) const;

 protected:
  void SetData(std::shared_ptr<ArrayData> data);

  const type_code_t* raw_type_codes_;
  const UnionType* union_type_;

  // For caching boxed child data
  mutable std::vector<std::shared_ptr<Array>> boxed_fields_;
};

/// Concrete Array class for sparse union data
class ARROW_EXPORT SparseUnionArray : public UnionArray {
 public:
  using TypeClass = SparseUnionType;

  explicit SparseUnionArray(std::shared_ptr<ArrayData> data);

  SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
                   std::shared_ptr<Buffer> type_ids, int64_t offset = 0);

  /// \brief Construct SparseUnionArray from type_ids and children
  ///
  /// This function does the bare minimum of validation of the input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
                                             std::vector<type_code_t> type_codes) {
    return Make(std::move(type_ids), std::move(children), std::vector<std::string>{},
                std::move(type_codes));
  }

  /// \brief Construct SparseUnionArray with custom field names from type_ids and children
  ///
  /// This function does the bare minimum of validation of the input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] field_names Vector of strings containing the name of each field.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
                                             std::vector<std::string> field_names = {},
                                             std::vector<type_code_t> type_codes = {});

  const SparseUnionType* union_type() const {
    return internal::checked_cast<const SparseUnionType*>(union_type_);
  }

  /// \brief Get one of the child arrays, adjusting its null bitmap
  /// where the union array type code does not match.
  ///
  /// \param[in] index Which child array to get (i.e. the physical index, not the type
  /// code)
  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
  Result<std::shared_ptr<Array>> GetFlattenedField(
      int index, MemoryPool* pool = default_memory_pool()) const;

 protected:
  void SetData(std::shared_ptr<ArrayData> data);
};

/// \brief Concrete Array class for dense union data
///
/// Note that union types do not have a validity bitmap
class ARROW_EXPORT DenseUnionArray : public UnionArray {
 public:
  using TypeClass = DenseUnionType;

  explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);

  DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
                  std::shared_ptr<Buffer> type_ids,
                  std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);

  /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] value_offsets An array of signed int32 values indicating the
  /// relative offset into the respective child array for the type in a given slot.
  /// The respective offsets for each child value array must be in order / increasing.
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
                                             const Array& value_offsets,
                                             ArrayVector children,
                                             std::vector<type_code_t> type_codes) {
    return Make(type_ids, value_offsets, std::move(children), std::vector<std::string>{},
                std::move(type_codes));
  }

  /// \brief Construct DenseUnionArray with custom field names from type_ids,
  /// value_offsets, and children
  ///
  /// This function does the bare minimum of validation of the offsets and
  /// input types.
  ///
  /// \param[in] type_ids An array of logical type ids for the union type
  /// \param[in] value_offsets An array of signed int32 values indicating the
  /// relative offset into the respective child array for the type in a given slot.
  /// The respective offsets for each child value array must be in order / increasing.
  /// \param[in] children Vector of children Arrays containing the data for each type.
  /// \param[in] field_names Vector of strings containing the name of each field.
  /// \param[in] type_codes Vector of type codes.
  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
                                             const Array& value_offsets,
                                             ArrayVector children,
                                             std::vector<std::string> field_names = {},
                                             std::vector<type_code_t> type_codes = {});

  const DenseUnionType* union_type() const {
    return internal::checked_cast<const DenseUnionType*>(union_type_);
  }

  /// Note that this buffer does not account for any slice offset
  std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[2]; }

  int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }

  const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }

 protected:
  const int32_t* raw_value_offsets_;

  void SetData(const std::shared_ptr<ArrayData>& data);
};

/// @}

} // namespace arrow
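// A minimal usage sketch for DenseUnionArray::Make, assuming <arrow/api.h>;
// the function name is illustrative only. type_ids must be int8, value_offsets
// int32, and per-child offsets must be non-decreasing.
inline arrow::Status DenseUnionExample() {
  arrow::Int8Builder type_ids_builder;
  arrow::Int32Builder offsets_builder;
  arrow::Int64Builder int_child;
  arrow::StringBuilder str_child;
  ARROW_RETURN_NOT_OK(type_ids_builder.AppendValues({0, 1, 0}));
  ARROW_RETURN_NOT_OK(offsets_builder.AppendValues({0, 0, 1}));
  ARROW_RETURN_NOT_OK(int_child.AppendValues({7, 8}));
  ARROW_RETURN_NOT_OK(str_child.Append("x"));
  ARROW_ASSIGN_OR_RAISE(auto type_ids, type_ids_builder.Finish());
  ARROW_ASSIGN_OR_RAISE(auto value_offsets, offsets_builder.Finish());
  ARROW_ASSIGN_OR_RAISE(auto ints, int_child.Finish());
  ARROW_ASSIGN_OR_RAISE(auto strs, str_child.Finish());
  // Logical values, in order: 7 (int), "x" (string), 8 (int).
  ARROW_ASSIGN_OR_RAISE(
      auto union_array,
      arrow::DenseUnionArray::Make(*type_ids, *value_offsets, {ints, strs},
                                   {"ints", "strs"}, {0, 1}));
  (void)union_array;
  return arrow::Status::OK();
}
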
@@ -0,0 +1,202 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Array accessor types for primitive/C-type-based arrays, such as numbers,
// boolean, and temporal types.

#pragma once

#include <cstdint>
#include <memory>

#include "arrow/array/array_base.h"
#include "arrow/array/data.h"
#include "arrow/stl_iterator.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"  // IWYU pragma: export
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// Concrete Array class for boolean data
class ARROW_EXPORT BooleanArray : public PrimitiveArray {
 public:
  using TypeClass = BooleanType;
  using IteratorType = stl::ArrayIterator<BooleanArray>;

  explicit BooleanArray(const std::shared_ptr<ArrayData>& data);

  BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
               int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  bool Value(int64_t i) const {
    return bit_util::GetBit(reinterpret_cast<const uint8_t*>(raw_values_),
                            i + data_->offset);
  }

  bool GetView(int64_t i) const { return Value(i); }

  std::optional<bool> operator[](int64_t i) const { return *IteratorType(*this, i); }

  /// \brief Return the number of false (0) values among the valid
  /// values. Result is not cached.
  int64_t false_count() const;

  /// \brief Return the number of true (1) values among the valid
  /// values. Result is not cached.
  int64_t true_count() const;

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  using PrimitiveArray::PrimitiveArray;
};
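
// A minimal usage sketch for BooleanArray, assuming <arrow/api.h>; the
// function name is illustrative only.
inline arrow::Status BooleanArrayExample() {
  arrow::BooleanBuilder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({true, false, true}));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
  auto& bools = static_cast<const arrow::BooleanArray&>(*array);
  int64_t n_true = bools.true_count();  // 2; nulls are not counted
  bool first = bools.Value(0);          // true
  (void)n_true;
  (void)first;
  return arrow::Status::OK();
}
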
/// \addtogroup numeric-arrays
///
/// @{

/// \brief Concrete Array class for numeric data with a corresponding C type
///
/// This class is templated on the corresponding DataType subclass for the
/// given data, for example NumericArray<Int8Type> or NumericArray<Date32Type>.
///
/// Note that convenience aliases are available for all accepted types
/// (for example Int8Array for NumericArray<Int8Type>).
template <typename TYPE>
class NumericArray : public PrimitiveArray {
 public:
  using TypeClass = TYPE;
  using value_type = typename TypeClass::c_type;
  using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;

  explicit NumericArray(const std::shared_ptr<ArrayData>& data) : PrimitiveArray(data) {}

  // Only enable this constructor without a type argument for types without additional
  // metadata
  template <typename T1 = TYPE>
  NumericArray(enable_if_parameter_free<T1, int64_t> length,
               const std::shared_ptr<Buffer>& data,
               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
               int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap,
                       null_count, offset) {}

  const value_type* raw_values() const {
    return reinterpret_cast<const value_type*>(raw_values_) + data_->offset;
  }

  value_type Value(int64_t i) const { return raw_values()[i]; }

  // For API compatibility with BinaryArray etc.
  value_type GetView(int64_t i) const { return Value(i); }

  std::optional<value_type> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

 protected:
  using PrimitiveArray::PrimitiveArray;
};
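
// A minimal usage sketch for NumericArray via the Int32Array alias, assuming
// <arrow/api.h>; the function name is illustrative only.
inline arrow::Status NumericArrayExample() {
  arrow::Int32Builder builder;
  ARROW_RETURN_NOT_OK(builder.AppendValues({10, 20, 30}));
  ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
  auto& ints = static_cast<const arrow::Int32Array&>(*array);
  int64_t sum = 0;
  for (std::optional<int32_t> v : ints) {  // STL-style iteration, nulls as nullopt
    if (v.has_value()) sum += *v;
  }
  const int32_t* raw = ints.raw_values();  // zero-copy, already offset-adjusted
  (void)sum;
  (void)raw;
  return arrow::Status::OK();
}
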
/// DayTimeIntervalArray
/// ---------------------
/// \brief Array of Day and Millisecond values.
class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
 public:
  using TypeClass = DayTimeIntervalType;
  using IteratorType = stl::ArrayIterator<DayTimeIntervalArray>;

  explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);

  DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
                       const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  DayTimeIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  TypeClass::DayMilliseconds GetValue(int64_t i) const;
  TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); }

  // For compatibility with Take kernel.
  TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

  std::optional<TypeClass::DayMilliseconds> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); }

  const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
};

/// \brief Array of Month, Day and nanosecond values.
class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
 public:
  using TypeClass = MonthDayNanoIntervalType;
  using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;

  explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);

  MonthDayNanoIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
                            const std::shared_ptr<Buffer>& data,
                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  TypeClass::MonthDayNanos GetValue(int64_t i) const;
  TypeClass::MonthDayNanos Value(int64_t i) const { return GetValue(i); }

  // For compatibility with Take kernel.
  TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); }

  IteratorType begin() const { return IteratorType(*this); }

  IteratorType end() const { return IteratorType(*this, length()); }

  std::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
    return *IteratorType(*this, i);
  }

  int32_t byte_width() const { return sizeof(TypeClass::MonthDayNanos); }

  const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
};

/// @}

} // namespace arrow
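// A minimal sketch reading DayTimeIntervalArray values, assuming <arrow/api.h>;
// the function name is illustrative only. MonthDayNanoIntervalArray is
// analogous, with MonthDayNanos values.
inline void VisitDayTimeIntervals(const arrow::DayTimeIntervalArray& array) {
  for (int64_t i = 0; i < array.length(); ++i) {
    if (!array.IsValid(i)) continue;  // skip null slots
    auto dm = array.GetValue(i);      // a DayMilliseconds value
    (void)dm;
  }
}
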
@@ -0,0 +1,217 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <cstring>
#include <memory>
#include <type_traits>

#include "arrow/array/builder_base.h"
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup numeric-builders
///
/// @{

namespace internal {

class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
 public:
  AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool,
                         int64_t alignment = kDefaultBufferAlignment);

  explicit AdaptiveIntBuilderBase(MemoryPool* pool,
                                  int64_t alignment = kDefaultBufferAlignment)
      : AdaptiveIntBuilderBase(sizeof(uint8_t), pool, alignment) {}

  /// \brief Append multiple nulls
  /// \param[in] length the number of nulls to append
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (ARROW_PREDICT_TRUE(length > 0)) {
      ARROW_RETURN_NOT_OK(Reserve(length));
      memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
      UnsafeSetNull(length);
    }
    return Status::OK();
  }

  Status AppendNull() final {
    pending_data_[pending_pos_] = 0;
    pending_valid_[pending_pos_] = 0;
    pending_has_nulls_ = true;
    ++pending_pos_;
    ++length_;
    ++null_count_;

    if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
      return CommitPendingData();
    }
    return Status::OK();
  }

  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(CommitPendingData());
    if (ARROW_PREDICT_TRUE(length > 0)) {
      ARROW_RETURN_NOT_OK(Reserve(length));
      memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
      UnsafeSetNotNull(length);
    }
    return Status::OK();
  }

  Status AppendEmptyValue() final {
    pending_data_[pending_pos_] = 0;
    pending_valid_[pending_pos_] = 1;
    ++pending_pos_;
    ++length_;

    if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
      return CommitPendingData();
    }
    return Status::OK();
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;

 protected:
  Status AppendInternal(const uint64_t val) {
    pending_data_[pending_pos_] = val;
    pending_valid_[pending_pos_] = 1;
    ++pending_pos_;
    ++length_;

    if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
      return CommitPendingData();
    }
    return Status::OK();
  }

  virtual Status CommitPendingData() = 0;

  template <typename new_type, typename old_type>
  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
  ExpandIntSizeInternal();
  template <typename new_type, typename old_type>
  typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
  ExpandIntSizeInternal();

  std::shared_ptr<ResizableBuffer> data_;
  uint8_t* raw_data_ = NULLPTR;

  const uint8_t start_int_size_;
  uint8_t int_size_;

  static constexpr int32_t pending_size_ = 1024;
  uint8_t pending_valid_[pending_size_];
  uint64_t pending_data_[pending_size_];
  int32_t pending_pos_ = 0;
  bool pending_has_nulls_ = false;
};

} // namespace internal

class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
 public:
  explicit AdaptiveUIntBuilder(uint8_t start_int_size,
                               MemoryPool* pool = default_memory_pool());

  explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool())
      : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {}

  using ArrayBuilder::Advance;
  using internal::AdaptiveIntBuilderBase::Reset;

  /// Scalar append
  Status Append(const uint64_t val) { return AppendInternal(val); }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const uint64_t* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override;

 protected:
  Status CommitPendingData() override;
  Status ExpandIntSize(uint8_t new_int_size);

  Status AppendValuesInternal(const uint64_t* values, int64_t length,
                              const uint8_t* valid_bytes);

  template <typename new_type>
  Status ExpandIntSizeN();
};

class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
 public:
  explicit AdaptiveIntBuilder(uint8_t start_int_size,
                              MemoryPool* pool = default_memory_pool(),
                              int64_t alignment = kDefaultBufferAlignment);

  explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool(),
                              int64_t alignment = kDefaultBufferAlignment)
      : AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {}

  using ArrayBuilder::Advance;
  using internal::AdaptiveIntBuilderBase::Reset;

  /// Scalar append
  Status Append(const int64_t val) { return AppendInternal(static_cast<uint64_t>(val)); }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const int64_t* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  std::shared_ptr<DataType> type() const override;

 protected:
  Status CommitPendingData() override;
  Status ExpandIntSize(uint8_t new_int_size);

  Status AppendValuesInternal(const int64_t* values, int64_t length,
                              const uint8_t* valid_bytes);

  template <typename new_type>
  Status ExpandIntSizeN();
};

/// @}

} // namespace arrow
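// A minimal usage sketch for AdaptiveIntBuilder, assuming <arrow/api.h>; the
// function name is illustrative only.
inline arrow::Status AdaptiveIntExample() {
  arrow::AdaptiveIntBuilder builder;  // starts with 1-byte storage
  ARROW_RETURN_NOT_OK(builder.Append(1));
  ARROW_RETURN_NOT_OK(builder.Append(1000000));  // triggers internal widening
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  std::shared_ptr<arrow::Array> out;
  ARROW_RETURN_NOT_OK(builder.Finish(&out));
  // out->type() is the narrowest signed integer type that fits all values.
  return arrow::Status::OK();
}
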
@@ -0,0 +1,352 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <algorithm>  // IWYU pragma: keep
#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/array/array_primitive.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \defgroup numeric-builders Concrete builder subclasses for numeric types
/// @{
/// @}

/// \defgroup temporal-builders Concrete builder subclasses for temporal types
/// @{
/// @}

/// \defgroup binary-builders Concrete builder subclasses for binary types
/// @{
/// @}

/// \defgroup nested-builders Concrete builder subclasses for nested types
/// @{
/// @}

/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types
/// @{
/// @}

constexpr int64_t kMinBuilderCapacity = 1 << 5;
constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;

/// Base class for all data array builders.
///
/// This class provides facilities for incrementally building the null bitmap
/// (see Append methods) and as a side effect the current number of slots and
/// the null count.
///
/// \note Users are expected to use builders as one of the concrete types below.
/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
class ARROW_EXPORT ArrayBuilder {
 public:
  explicit ArrayBuilder(MemoryPool* pool, int64_t alignment = kDefaultBufferAlignment)
      : pool_(pool), alignment_(alignment), null_bitmap_builder_(pool, alignment) {}

  ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);

  virtual ~ArrayBuilder() = default;

  /// For nested types. Since the objects are owned by this class instance, we
  /// skip shared pointers and just return a raw pointer
  ArrayBuilder* child(int i) { return children_[i].get(); }

  const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }

  int num_children() const { return static_cast<int>(children_.size()); }

  virtual int64_t length() const { return length_; }
  int64_t null_count() const { return null_count_; }
  int64_t capacity() const { return capacity_; }

  /// \brief Ensure that enough memory has been allocated to fit the indicated
  /// number of total elements in the builder, including any that have already
  /// been appended. Does not account for reallocations that may be due to
  /// variable size data, like binary values. To make space for incremental
  /// appends, use Reserve instead.
  ///
  /// \param[in] capacity the minimum number of total array values to
  ///            accommodate. Must be greater than the current capacity.
  /// \return Status
  virtual Status Resize(int64_t capacity);

  /// \brief Ensure that there is enough space allocated to append the indicated
  /// number of elements without any further reallocation. Overallocation is
  /// used in order to minimize the impact of incremental Reserve() calls.
  /// Note that additional_capacity is relative to the current number of elements
  /// rather than to the current capacity, so calls to Reserve() which are not
  /// interspersed with addition of new elements may not increase the capacity.
  ///
  /// \param[in] additional_capacity the number of additional array values
  /// \return Status
  Status Reserve(int64_t additional_capacity) {
    auto current_capacity = capacity();
    auto min_capacity = length() + additional_capacity;
    if (min_capacity <= current_capacity) return Status::OK();

    // leave growth factor up to BufferBuilder
    auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
    return Resize(new_capacity);
  }

  /// Reset the builder.
  virtual void Reset();

  /// \brief Append a null value to builder
  virtual Status AppendNull() = 0;
  /// \brief Append a number of null values to builder
  virtual Status AppendNulls(int64_t length) = 0;

  /// \brief Append a non-null value to builder
  ///
  /// The appended value is an implementation detail, but the corresponding
  /// memory slot is guaranteed to be initialized.
  /// This method is useful when appending a null value to a parent nested type.
  virtual Status AppendEmptyValue() = 0;

  /// \brief Append a number of non-null values to builder
  ///
  /// The appended values are an implementation detail, but the corresponding
  /// memory slot is guaranteed to be initialized.
  /// This method is useful when appending null values to a parent nested type.
  virtual Status AppendEmptyValues(int64_t length) = 0;

  /// \brief Append a value from a scalar
  Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
  virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
  virtual Status AppendScalars(const ScalarVector& scalars);

  /// \brief Append a range of values from an array.
  ///
  /// The given array must be the same type as the builder.
  virtual Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                                  int64_t length) {
    return Status::NotImplemented("AppendArraySlice for builder for ", *type());
  }

  /// For cases where raw data was memcpy'd into the internal buffers, allows us
  /// to advance the length of the builder. It is your responsibility to use
  /// this function responsibly.
  ARROW_DEPRECATED(
      "Deprecated in 6.0.0. ArrayBuilder::Advance is poorly supported and mostly "
      "untested.\nFor low-level control over buffer construction, use BufferBuilder "
      "or TypedBufferBuilder directly.")
  Status Advance(int64_t elements);

  /// \brief Return result of builder as an internal generic ArrayData
  /// object. Resets builder except for dictionary builder
  ///
  /// \param[out] out the finalized ArrayData object
  /// \return Status
  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;

  /// \brief Return result of builder as an Array object.
  ///
  /// The builder is reset except for DictionaryBuilder.
  ///
  /// \param[out] out the finalized Array object
  /// \return Status
  Status Finish(std::shared_ptr<Array>* out);

  /// \brief Return result of builder as an Array object.
  ///
  /// The builder is reset except for DictionaryBuilder.
  ///
  /// \return The finalized Array object
  Result<std::shared_ptr<Array>> Finish();

  /// \brief Return the type of the built Array
  virtual std::shared_ptr<DataType> type() const = 0;

 protected:
  /// Append to null bitmap
  Status AppendToBitmap(bool is_valid);

  /// Vector append. Treat each zero byte as a null. If valid_bytes is null
  /// assume all of length bits are valid.
  Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);

  /// Uniform append. Append N times the same validity bit.
  Status AppendToBitmap(int64_t num_bits, bool value);

  /// Set the next length bits to not null (i.e. valid).
  Status SetNotNull(int64_t length);

  // Unsafe operations (don't check capacity/don't resize)

  void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }

  // Append to null bitmap, update the length
  void UnsafeAppendToBitmap(bool is_valid) {
    null_bitmap_builder_.UnsafeAppend(is_valid);
    ++length_;
    if (!is_valid) ++null_count_;
  }

  // Vector append. Treat each zero byte as a null. If valid_bytes is null
  // assume all of length bits are valid.
  void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
    if (valid_bytes == NULLPTR) {
      return UnsafeSetNotNull(length);
    }
    null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
    length_ += length;
    null_count_ = null_bitmap_builder_.false_count();
  }

  // Vector append. Copy from a given bitmap. If bitmap is null assume
  // all of length bits are valid.
  void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
    if (bitmap == NULLPTR) {
      return UnsafeSetNotNull(length);
    }
    null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
    length_ += length;
    null_count_ = null_bitmap_builder_.false_count();
  }

  // Append the same validity value a given number of times.
  void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
    if (value) {
      UnsafeSetNotNull(num_bits);
    } else {
      UnsafeSetNull(num_bits);
    }
  }

  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);

  // Set the next validity bits to not null (i.e. valid).
  void UnsafeSetNotNull(int64_t length);

  // Set the next validity bits to null (i.e. invalid).
  void UnsafeSetNull(int64_t length);

  static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);

  /// \brief Finish to an array of the specified ArrayType
  template <typename ArrayType>
  Status FinishTyped(std::shared_ptr<ArrayType>* out) {
    std::shared_ptr<Array> out_untyped;
    ARROW_RETURN_NOT_OK(Finish(&out_untyped));
    *out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
    return Status::OK();
  }

  // Check the requested capacity for validity
  Status CheckCapacity(int64_t new_capacity) {
    if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
      return Status::Invalid(
          "Resize capacity must be positive (requested: ", new_capacity, ")");
    }

    if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
      return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
                             ", current length: ", length_, ")");
    }

    return Status::OK();
  }

  // Check for array type
  Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
                        const Array& array, const char* message);
  Status CheckArrayType(Type::type expected_type, const Array& array,
                        const char* message);

  MemoryPool* pool_;
  int64_t alignment_;

  TypedBufferBuilder<bool> null_bitmap_builder_;
  int64_t null_count_ = 0;

  // Array length, so far. Also, the index of the next element to be added
  int64_t length_ = 0;
  int64_t capacity_ = 0;

  // Child value array builders. These are owned by this class
  std::vector<std::shared_ptr<ArrayBuilder>> children_;

 private:
  ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
};

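// A minimal sketch of the Reserve() + UnsafeAppend pattern described above,
// using Int64Builder from <arrow/api.h>; the function name is illustrative only.
inline arrow::Status ReservePatternExample() {
  arrow::Int64Builder builder;
  // Reserve is relative to the current length: make room for three more
  // elements, then append without per-call capacity checks.
  ARROW_RETURN_NOT_OK(builder.Reserve(3));
  builder.UnsafeAppend(1);
  builder.UnsafeAppend(2);
  builder.UnsafeAppendNull();
  ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
  (void)array;
  return arrow::Status::OK();
}
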
/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the data type to create the builder for
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                   std::unique_ptr<ArrayBuilder>* out);

inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> out;
  ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out));
  return std::move(out);
}

/// \brief Construct an empty ArrayBuilder corresponding to the data
/// type, where any top-level or nested dictionary builders return the
/// exact index type specified by the type.
ARROW_EXPORT
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                             std::unique_ptr<ArrayBuilder>* out);

inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> out;
  ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out));
  return std::move(out);
}

/// \brief Construct an empty DictionaryBuilder initialized optionally
/// with a pre-existing dictionary
/// \param[in] pool the MemoryPool to use for allocations
/// \param[in] type the dictionary type to create the builder for
/// \param[in] dictionary the initial dictionary, if any. May be nullptr
/// \param[out] out the created ArrayBuilder
ARROW_EXPORT
Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
                             const std::shared_ptr<Array>& dictionary,
                             std::unique_ptr<ArrayBuilder>* out);

inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
    const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
    MemoryPool* pool = default_memory_pool()) {
  std::unique_ptr<ArrayBuilder> out;
  ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out));
  return std::move(out);
}

} // namespace arrow
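// A minimal usage sketch for the Result-returning MakeBuilder overload,
// assuming <arrow/api.h>; ListBuilder comes from builder_nested.h and the
// function name is illustrative only.
inline arrow::Status MakeBuilderExample() {
  ARROW_ASSIGN_OR_RAISE(std::unique_ptr<arrow::ArrayBuilder> builder,
                        arrow::MakeBuilder(arrow::list(arrow::int32())));
  // As noted above, downcast the generic builder before use.
  auto* list_builder = static_cast<arrow::ListBuilder*>(builder.get());
  (void)list_builder;
  return arrow::Status::OK();
}
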
@@ -0,0 +1,707 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>
#include <numeric>
#include <string>
#include <string_view>
#include <vector>

#include "arrow/array/array_base.h"
#include "arrow/array/array_binary.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup binary-builders
///
/// @{

// ----------------------------------------------------------------------
// Binary and String

template <typename TYPE>
class BaseBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        offsets_builder_(pool, alignment),
        value_data_builder_(pool, alignment) {}

  BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
      : BaseBinaryBuilder(pool) {}

  Status Append(const uint8_t* value, offset_type length) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    ARROW_RETURN_NOT_OK(AppendNextOffset());
    // Safety check for UBSAN.
    if (ARROW_PREDICT_TRUE(length > 0)) {
      ARROW_RETURN_NOT_OK(ValidateOverflow(length));
      ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
    }

    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  Status Append(const char* value, offset_type length) {
    return Append(reinterpret_cast<const uint8_t*>(value), length);
  }

  Status Append(std::string_view value) {
    return Append(value.data(), static_cast<offset_type>(value.size()));
  }

  /// Extend the last appended value by appending more data at the end
  ///
  /// Unlike Append, this does not create a new offset.
  Status ExtendCurrent(const uint8_t* value, offset_type length) {
    // Safety check for UBSAN.
    if (ARROW_PREDICT_TRUE(length > 0)) {
      ARROW_RETURN_NOT_OK(ValidateOverflow(length));
      ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
    }
    return Status::OK();
  }

  Status ExtendCurrent(std::string_view value) {
    return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
                         static_cast<offset_type>(value.size()));
  }

  Status AppendNulls(int64_t length) final {
    const int64_t num_bytes = value_data_builder_.length();
    ARROW_RETURN_NOT_OK(Reserve(length));
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    }
    UnsafeAppendToBitmap(length, false);
    return Status::OK();
  }

  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(AppendNextOffset());
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(AppendNextOffset());
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  Status AppendEmptyValues(int64_t length) final {
    const int64_t num_bytes = value_data_builder_.length();
    ARROW_RETURN_NOT_OK(Reserve(length));
    for (int64_t i = 0; i < length; ++i) {
      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    }
    UnsafeAppendToBitmap(length, true);
    return Status::OK();
  }

  /// \brief Append without checking capacity
  ///
  /// Offsets and data should have been presized using Reserve() and
  /// ReserveData(), respectively.
  void UnsafeAppend(const uint8_t* value, offset_type length) {
    UnsafeAppendNextOffset();
    value_data_builder_.UnsafeAppend(value, length);
    UnsafeAppendToBitmap(true);
  }

  void UnsafeAppend(const char* value, offset_type length) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
  }

  void UnsafeAppend(const std::string& value) {
    UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
  }

  void UnsafeAppend(std::string_view value) {
    UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
  }

  /// Like ExtendCurrent, but do not check capacity
  void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
    value_data_builder_.UnsafeAppend(value, length);
  }

  void UnsafeExtendCurrent(std::string_view value) {
    UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
                        static_cast<offset_type>(value.size()));
  }

  void UnsafeAppendNull() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    UnsafeAppendToBitmap(false);
  }

  void UnsafeAppendEmptyValue() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
    UnsafeAppendToBitmap(true);
  }

  /// \brief Append a sequence of strings in one shot.
  ///
  /// \param[in] values a vector of strings
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const std::vector<std::string>& values,
                      const uint8_t* valid_bytes = NULLPTR) {
    std::size_t total_length = std::accumulate(
        values.begin(), values.end(), 0ULL,
        [](uint64_t sum, const std::string& str) { return sum + str.size(); });
    ARROW_RETURN_NOT_OK(Reserve(values.size()));
    ARROW_RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
    ARROW_RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));

    if (valid_bytes != NULLPTR) {
      for (std::size_t i = 0; i < values.size(); ++i) {
        UnsafeAppendNextOffset();
        if (valid_bytes[i]) {
          value_data_builder_.UnsafeAppend(
              reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
        }
      }
    } else {
      for (std::size_t i = 0; i < values.size(); ++i) {
        UnsafeAppendNextOffset();
        value_data_builder_.UnsafeAppend(
            reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
      }
    }

    UnsafeAppendToBitmap(valid_bytes, values.size());
    return Status::OK();
  }

||||
/// \brief Append a sequence of nul-terminated strings in one shot.
|
||||
/// If one of the values is NULL, it is processed as a null
|
||||
/// value even if the corresponding valid_bytes entry is 1.
|
||||
///
|
||||
/// \param[in] values a contiguous C array of nul-terminated char *
|
||||
/// \param[in] length the number of values to append
|
||||
/// \param[in] valid_bytes an optional sequence of bytes where non-zero
|
||||
/// indicates a valid (non-null) value
|
||||
/// \return Status
|
||||
Status AppendValues(const char** values, int64_t length,
|
||||
const uint8_t* valid_bytes = NULLPTR) {
|
||||
std::size_t total_length = 0;
|
||||
std::vector<std::size_t> value_lengths(length);
|
||||
bool have_null_value = false;
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
if (values[i] != NULLPTR) {
|
||||
auto value_length = strlen(values[i]);
|
||||
value_lengths[i] = value_length;
|
||||
total_length += value_length;
|
||||
} else {
|
||||
have_null_value = true;
|
||||
}
|
||||
}
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
ARROW_RETURN_NOT_OK(ReserveData(total_length));
|
||||
|
||||
if (valid_bytes) {
|
||||
int64_t valid_bytes_offset = 0;
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
if (valid_bytes[i]) {
|
||||
if (values[i]) {
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
||||
value_lengths[i]);
|
||||
} else {
|
||||
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
|
||||
i - valid_bytes_offset);
|
||||
UnsafeAppendToBitmap(false);
|
||||
valid_bytes_offset = i + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
|
||||
} else {
|
||||
if (have_null_value) {
|
||||
std::vector<uint8_t> valid_vector(length, 0);
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
if (values[i]) {
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
||||
value_lengths[i]);
|
||||
valid_vector[i] = 1;
|
||||
}
|
||||
}
|
||||
UnsafeAppendToBitmap(valid_vector.data(), length);
|
||||
} else {
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
UnsafeAppendNextOffset();
|
||||
value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
|
||||
value_lengths[i]);
|
||||
}
|
||||
UnsafeAppendToBitmap(NULLPTR, length);
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
auto bitmap = array.GetValues<uint8_t>(0, 0);
|
||||
auto offsets = array.GetValues<offset_type>(1);
|
||||
auto data = array.GetValues<uint8_t>(2, 0);
|
||||
for (int64_t i = 0; i < length; i++) {
|
||||
if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
|
||||
const offset_type start = offsets[offset + i];
|
||||
const offset_type end = offsets[offset + i + 1];
|
||||
ARROW_RETURN_NOT_OK(Append(data + start, end - start));
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void Reset() override {
|
||||
ArrayBuilder::Reset();
|
||||
offsets_builder_.Reset();
|
||||
value_data_builder_.Reset();
|
||||
}
|
||||
|
||||
Status ValidateOverflow(int64_t new_bytes) {
|
||||
auto new_size = value_data_builder_.length() + new_bytes;
|
||||
if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
|
||||
return Status::CapacityError("array cannot contain more than ", memory_limit(),
|
||||
" bytes, have ", new_size);
|
||||
} else {
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
Status Resize(int64_t capacity) override {
|
||||
ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
|
||||
// One more than requested for offsets
|
||||
ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
|
||||
return ArrayBuilder::Resize(capacity);
|
||||
}
|
||||
|
||||
/// \brief Ensures there is enough allocated capacity to append the indicated
|
||||
/// number of bytes to the value data buffer without additional allocations
|
||||
Status ReserveData(int64_t elements) {
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
|
||||
return value_data_builder_.Reserve(elements);
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
// Write final offset (values length)
|
||||
ARROW_RETURN_NOT_OK(AppendNextOffset());
|
||||
|
||||
// These buffers' padding zeroed by BufferBuilder
|
||||
std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
|
||||
ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
|
||||
ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
|
||||
ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
|
||||
|
||||
*out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
|
||||
null_count_, 0);
|
||||
Reset();
|
||||
return Status::OK();
|
||||
}
|
||||

  /// \return data pointer of the value data builder
  const uint8_t* value_data() const { return value_data_builder_.data(); }
  /// \return size of values buffer so far
  int64_t value_data_length() const { return value_data_builder_.length(); }
  /// \return capacity of values buffer
  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }

  /// \return data pointer of the offsets builder
  const offset_type* offsets_data() const { return offsets_builder_.data(); }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
    const offset_type* offsets = offsets_builder_.data();
    const auto offset = offsets[i];
    if (i == (length_ - 1)) {
      *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
    } else {
      *out_length = offsets[i + 1] - offset;
    }
    return value_data_builder_.data() + offset;
  }

  offset_type offset(int64_t i) const { return offsets_data()[i]; }

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  std::string_view GetView(int64_t i) const {
    offset_type value_length;
    const uint8_t* value_data = GetValue(i, &value_length);
    return std::string_view(reinterpret_cast<const char*>(value_data), value_length);
  }

  // Cannot make this a static attribute because of linking issues
  static constexpr int64_t memory_limit() {
    return std::numeric_limits<offset_type>::max() - 1;
  }

 protected:
  TypedBufferBuilder<offset_type> offsets_builder_;
  TypedBufferBuilder<uint8_t> value_data_builder_;

  Status AppendNextOffset() {
    const int64_t num_bytes = value_data_builder_.length();
    return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
  }

  void UnsafeAppendNextOffset() {
    const int64_t num_bytes = value_data_builder_.length();
    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
  }
};

/// \class BinaryBuilder
/// \brief Builder class for variable-length binary data
class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
 public:
  using BaseBinaryBuilder::BaseBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override { return binary(); }
};

/// \class StringBuilder
/// \brief Builder class for UTF8 strings
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
 public:
  using BinaryBuilder::BinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override { return utf8(); }
};

/// \class LargeBinaryBuilder
/// \brief Builder class for large variable-length binary data
class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
 public:
  using BaseBinaryBuilder::BaseBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override { return large_binary(); }
};

/// \class LargeStringBuilder
/// \brief Builder class for large UTF8 strings
class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
 public:
  using LargeBinaryBuilder::LargeBinaryBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override { return large_utf8(); }
};
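
// ---------------------------------------------------------------------------
// Editor's usage sketch (illustrative addition, not part of the upstream
// header). It exercises only the public API declared above; the helper name
// ExampleBuildStringArray is hypothetical.
inline Status ExampleBuildStringArray(std::shared_ptr<StringArray>* out) {
  StringBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append("alpha"));  // one offset + 5 value bytes
  ARROW_RETURN_NOT_OK(builder.AppendNull());     // null slot, no value bytes
  ARROW_RETURN_NOT_OK(builder.Append(std::string_view("beta")));
  return builder.Finish(out);  // FinishInternal writes the final offset
}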

// ----------------------------------------------------------------------
// FixedSizeBinaryBuilder

class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
 public:
  using TypeClass = FixedSizeBinaryType;

  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
                                  MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment);

  Status Append(const uint8_t* value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  Status Append(const char* value) {
    return Append(reinterpret_cast<const uint8_t*>(value));
  }

  Status Append(const std::string_view& view) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(view);
    return Status::OK();
  }

  Status Append(const std::string& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(s);
    return Status::OK();
  }

  Status Append(const Buffer& s) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(std::string_view(s));
    return Status::OK();
  }

  Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }

  template <size_t NBYTES>
  Status Append(const std::array<uint8_t, NBYTES>& value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(
        std::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
    return Status::OK();
  }

  Status AppendValues(const uint8_t* data, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
                      int64_t bitmap_offset);

  Status AppendNull() final;
  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;
  Status AppendEmptyValues(int64_t length) final;

  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    return AppendValues(
        array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
        array.GetValues<uint8_t>(0, 0), array.offset + offset);
  }

  void UnsafeAppend(const uint8_t* value) {
    UnsafeAppendToBitmap(true);
    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
      byte_builder_.UnsafeAppend(value, byte_width_);
    }
  }

  void UnsafeAppend(const char* value) {
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
  }

  void UnsafeAppend(std::string_view value) {
#ifndef NDEBUG
    CheckValueSize(static_cast<size_t>(value.size()));
#endif
    UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
  }

  void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view(s)); }

  void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }

  void UnsafeAppendNull() {
    UnsafeAppendToBitmap(false);
    byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
  }

  Status ValidateOverflow(int64_t new_bytes) const {
    auto new_size = byte_builder_.length() + new_bytes;
    if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
      return Status::CapacityError("array cannot contain more than ", memory_limit(),
                                   " bytes, have ", new_size);
    } else {
      return Status::OK();
    }
  }

  /// \brief Ensures there is enough allocated capacity to append the indicated
  /// number of bytes to the value data buffer without additional allocations
  Status ReserveData(int64_t elements) {
    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
    return byte_builder_.Reserve(elements);
  }

  void Reset() override;
  Status Resize(int64_t capacity) override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }

  /// \return size of values buffer so far
  int64_t value_data_length() const { return byte_builder_.length(); }

  int32_t byte_width() const { return byte_width_; }

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  const uint8_t* GetValue(int64_t i) const;

  /// Temporary access to a value.
  ///
  /// This view becomes invalid on the next modifying operation.
  std::string_view GetView(int64_t i) const;

  static constexpr int64_t memory_limit() {
    return std::numeric_limits<int64_t>::max() - 1;
  }

  std::shared_ptr<DataType> type() const override {
    return fixed_size_binary(byte_width_);
  }

 protected:
  int32_t byte_width_;
  BufferBuilder byte_builder_;

  /// Temporary access to a value.
  ///
  /// This pointer becomes invalid on the next modifying operation.
  uint8_t* GetMutableValue(int64_t i) {
    uint8_t* data_ptr = byte_builder_.mutable_data();
    return data_ptr + i * byte_width_;
  }

  void CheckValueSize(int64_t size);
};
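
// Editor's usage sketch (illustrative addition, not part of the upstream
// header; the helper name is hypothetical). Every value appended must be
// exactly byte_width() bytes; null slots still occupy byte_width() zeroed
// bytes in the data buffer, as UnsafeAppendNull above shows.
inline Status ExampleBuildFixedSizeBinary(
    std::shared_ptr<FixedSizeBinaryArray>* out) {
  FixedSizeBinaryBuilder builder(fixed_size_binary(4));
  ARROW_RETURN_NOT_OK(builder.Append("abcd"));  // reads exactly 4 bytes
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  return builder.Finish(out);
}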

/// @}

// ----------------------------------------------------------------------
// Chunked builders: build a sequence of BinaryArray or StringArray that are
// limited to a particular size (with an upper limit of 2GB)

namespace internal {

class ARROW_EXPORT ChunkedBinaryBuilder {
 public:
  explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
                                MemoryPool* pool = default_memory_pool());

  ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
                       MemoryPool* pool = default_memory_pool());

  virtual ~ChunkedBinaryBuilder() = default;

  Status Append(const uint8_t* value, int32_t length) {
    if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
                            max_chunk_value_length_)) {
      if (builder_->value_data_length() == 0) {
        // The current item is larger than max_chunk_value_length_;
        // this chunk will be oversize and hold *only* this item
        ARROW_RETURN_NOT_OK(builder_->Append(value, length));
        return NextChunk();
      }
      // The current item would cause builder_->value_data_length() to exceed
      // max_chunk_value_length_, so finish this chunk and append the current
      // item to the next chunk
      ARROW_RETURN_NOT_OK(NextChunk());
      return Append(value, length);
    }

    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
      // The current item would cause builder_->length() to exceed
      // max_chunk_length_, so finish this chunk and append the current item
      // to the next chunk
      ARROW_RETURN_NOT_OK(NextChunk());
    }

    return builder_->Append(value, length);
  }

  Status Append(const std::string_view& value) {
    return Append(reinterpret_cast<const uint8_t*>(value.data()),
                  static_cast<int32_t>(value.size()));
  }

  Status AppendNull() {
    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
      ARROW_RETURN_NOT_OK(NextChunk());
    }
    return builder_->AppendNull();
  }

  Status Reserve(int64_t values);

  virtual Status Finish(ArrayVector* out);

 protected:
  Status NextChunk();

  // maximum total character data size per chunk
  int64_t max_chunk_value_length_;

  // maximum elements allowed per chunk
  int64_t max_chunk_length_ = kListMaximumElements;

  // when Reserve() would cause builder_ to exceed its max_chunk_length_,
  // add to extra_capacity_ instead and wait to reserve until the next chunk
  int64_t extra_capacity_ = 0;

  std::unique_ptr<BinaryBuilder> builder_;
  std::vector<std::shared_ptr<Array>> chunks_;
};

class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
 public:
  using ChunkedBinaryBuilder::ChunkedBinaryBuilder;

  Status Finish(ArrayVector* out) override;
};
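
// Editor's note (added commentary, not upstream): chunking exists because
// BinaryArray uses 32-bit offsets, capping one chunk's character data at
// about 2GB. Append() above starts a new chunk when the incoming value would
// push value_data_length() past max_chunk_value_length_ or when the element
// count would exceed max_chunk_length_; a single value larger than the limit
// is placed alone in an oversize chunk.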

} // namespace internal

} // namespace arrow
@@ -0,0 +1,102 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>

#include "arrow/array/array_decimal.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/data.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup numeric-builders
///
/// @{

class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal128Type;
  using ValueType = Decimal128;

  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment);

  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  Status Append(Decimal128 val);
  void UnsafeAppend(Decimal128 val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override { return decimal_type_; }

 protected:
  std::shared_ptr<Decimal128Type> decimal_type_;
};

class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
 public:
  using TypeClass = Decimal256Type;
  using ValueType = Decimal256;

  explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
                             MemoryPool* pool = default_memory_pool(),
                             int64_t alignment = kDefaultBufferAlignment);

  using FixedSizeBinaryBuilder::Append;
  using FixedSizeBinaryBuilder::AppendValues;
  using FixedSizeBinaryBuilder::Reset;

  Status Append(const Decimal256& val);
  void UnsafeAppend(const Decimal256& val);
  void UnsafeAppend(std::string_view val);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override { return decimal_type_; }

 protected:
  std::shared_ptr<Decimal256Type> decimal_type_;
};

using DecimalBuilder = Decimal128Builder;
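
// Editor's usage sketch (illustrative addition, not part of the upstream
// header): values are stored as fixed-width two's-complement integers scaled
// by the type's scale, so the unscaled value 12345 with scale 2 represents
// 123.45.
//
//   Decimal128Builder builder(decimal128(/*precision=*/10, /*scale=*/2));
//   ARROW_RETURN_NOT_OK(builder.Append(Decimal128(12345)));  // 123.45
//   ARROW_RETURN_NOT_OK(builder.AppendNull());
//   std::shared_ptr<Decimal128Array> out;
//   ARROW_RETURN_NOT_OK(builder.Finish(&out));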

/// @}

} // namespace arrow
@@ -0,0 +1,730 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <algorithm>
#include <cstdint>
#include <memory>
#include <type_traits>

#include "arrow/array/array_base.h"
#include "arrow/array/array_binary.h"
#include "arrow/array/builder_adaptive.h"  // IWYU pragma: export
#include "arrow/array/builder_base.h"  // IWYU pragma: export
#include "arrow/array/builder_primitive.h"  // IWYU pragma: export
#include "arrow/array/data.h"
#include "arrow/array/util.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_block_counter.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

// ----------------------------------------------------------------------
// Dictionary builder

namespace internal {

template <typename T, typename Enable = void>
struct DictionaryValue {
  using type = typename T::c_type;
  using PhysicalType = T;
};

template <typename T>
struct DictionaryValue<T, enable_if_base_binary<T>> {
  using type = std::string_view;
  using PhysicalType =
      typename std::conditional<std::is_same<typename T::offset_type, int32_t>::value,
                                BinaryType, LargeBinaryType>::type;
};

template <typename T>
struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
  using type = std::string_view;
  using PhysicalType = BinaryType;
};

class ARROW_EXPORT DictionaryMemoTable {
 public:
  DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<DataType>& type);
  DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<Array>& dictionary);
  ~DictionaryMemoTable();

  Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out);

  /// \brief Insert new memo values
  Status InsertValues(const Array& values);

  int32_t size() const;

  template <typename T>
  Status GetOrInsert(typename DictionaryValue<T>::type value, int32_t* out) {
    // We want to keep the DictionaryMemoTable implementation private, also we can't
    // use extern template classes because of compiler issues (MinGW?). Instead,
    // we expose explicit function overrides for each supported physical type.
    const typename DictionaryValue<T>::PhysicalType* physical_type = NULLPTR;
    return GetOrInsert(physical_type, value, out);
  }

 private:
  Status GetOrInsert(const BooleanType*, bool value, int32_t* out);
  Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out);
  Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out);
  Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out);
  Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out);
  Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out);
  Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
  Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
  Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
  Status GetOrInsert(const DurationType*, int64_t value, int32_t* out);
  Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out);
  Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out);
  Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out);
  Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out);
  Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out);
  Status GetOrInsert(const MonthDayNanoIntervalType*,
                     MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out);
  Status GetOrInsert(const DayTimeIntervalType*,
                     DayTimeIntervalType::DayMilliseconds value, int32_t* out);
  Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out);
  Status GetOrInsert(const FloatType*, float value, int32_t* out);
  Status GetOrInsert(const DoubleType*, double value, int32_t* out);

  Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out);
  Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out);

  class DictionaryMemoTableImpl;
  std::unique_ptr<DictionaryMemoTableImpl> impl_;
};

} // namespace internal

/// \addtogroup dictionary-builders
///
/// @{

namespace internal {

/// \brief Array builder for creating a dictionary-encoded DictionaryArray
/// from dense array data
///
/// Unlike other builders, the dictionary builder does not completely
/// reset its state on Finish calls.
template <typename BuilderType, typename T>
class DictionaryBuilderBase : public ArrayBuilder {
 public:
  using TypeClass = DictionaryType;
  using Value = typename DictionaryValue<T>::type;

  // WARNING: the type given below is the value type, not the DictionaryType.
  // The DictionaryType is instantiated on the Finish() call.
  template <typename B = BuilderType, typename T1 = T>
  DictionaryBuilderBase(uint8_t start_int_size,
                        enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
                                        !is_fixed_size_binary_type<T1>::value,
                                    const std::shared_ptr<DataType>&>
                            value_type,
                        MemoryPool* pool = default_memory_pool(),
                        int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
        delta_offset_(0),
        byte_width_(-1),
        indices_builder_(start_int_size, pool, alignment),
        value_type_(value_type) {}

  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
          value_type,
      MemoryPool* pool = default_memory_pool(),
      int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
        delta_offset_(0),
        byte_width_(-1),
        indices_builder_(pool, alignment),
        value_type_(value_type) {}

  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      const std::shared_ptr<DataType>& index_type,
      enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
          value_type,
      MemoryPool* pool = default_memory_pool(),
      int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
        delta_offset_(0),
        byte_width_(-1),
        indices_builder_(index_type, pool, alignment),
        value_type_(value_type) {}

  template <typename B = BuilderType, typename T1 = T>
  DictionaryBuilderBase(uint8_t start_int_size,
                        enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
                                        is_fixed_size_binary_type<T1>::value,
                                    const std::shared_ptr<DataType>&>
                            value_type,
                        MemoryPool* pool = default_memory_pool(),
                        int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
        delta_offset_(0),
        byte_width_(static_cast<const T1&>(*value_type).byte_width()),
        indices_builder_(start_int_size, pool, alignment),
        value_type_(value_type) {}

  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
      MemoryPool* pool = default_memory_pool(),
      int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
        delta_offset_(0),
        byte_width_(static_cast<const T1&>(*value_type).byte_width()),
        indices_builder_(pool, alignment),
        value_type_(value_type) {}

  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      const std::shared_ptr<DataType>& index_type,
      enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
      MemoryPool* pool = default_memory_pool(),
      int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
        delta_offset_(0),
        byte_width_(static_cast<const T1&>(*value_type).byte_width()),
        indices_builder_(index_type, pool, alignment),
        value_type_(value_type) {}

  template <typename T1 = T>
  explicit DictionaryBuilderBase(
      enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
      : DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}

  // This constructor doesn't check for errors. Use InsertMemoValues instead.
  explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
                                 MemoryPool* pool = default_memory_pool(),
                                 int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
        delta_offset_(0),
        byte_width_(-1),
        indices_builder_(pool, alignment),
        value_type_(dictionary->type()) {}

  ~DictionaryBuilderBase() override = default;

  /// \brief The current number of entries in the dictionary
  int64_t dictionary_length() const { return memo_table_->size(); }

  /// \brief The value byte width (for FixedSizeBinaryType)
  template <typename T1 = T>
  enable_if_fixed_size_binary<T1, int32_t> byte_width() const {
    return byte_width_;
  }

  /// \brief Append a scalar value
  Status Append(Value value) {
    ARROW_RETURN_NOT_OK(Reserve(1));

    int32_t memo_index;
    ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert<T>(value, &memo_index));
    ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index));
    length_ += 1;

    return Status::OK();
  }

  /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
  template <typename T1 = T>
  enable_if_fixed_size_binary<T1, Status> Append(const uint8_t* value) {
    return Append(std::string_view(reinterpret_cast<const char*>(value), byte_width_));
  }

  /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
  template <typename T1 = T>
  enable_if_fixed_size_binary<T1, Status> Append(const char* value) {
    return Append(std::string_view(value, byte_width_));
  }

  /// \brief Append a string (only for binary types)
  template <typename T1 = T>
  enable_if_binary_like<T1, Status> Append(const uint8_t* value, int32_t length) {
    return Append(reinterpret_cast<const char*>(value), length);
  }

  /// \brief Append a string (only for binary types)
  template <typename T1 = T>
  enable_if_binary_like<T1, Status> Append(const char* value, int32_t length) {
    return Append(std::string_view(value, length));
  }

  /// \brief Append a string (only for string types)
  template <typename T1 = T>
  enable_if_string_like<T1, Status> Append(const char* value, int32_t length) {
    return Append(std::string_view(value, length));
  }

  /// \brief Append a decimal (only for Decimal128Type)
  template <typename T1 = T>
  enable_if_decimal128<T1, Status> Append(const Decimal128& value) {
    uint8_t data[16];
    value.ToBytes(data);
    return Append(data, 16);
  }

  /// \brief Append a decimal (only for Decimal256Type)
  template <typename T1 = T>
  enable_if_decimal256<T1, Status> Append(const Decimal256& value) {
    uint8_t data[32];
    value.ToBytes(data);
    return Append(data, 32);
  }

  /// \brief Append a scalar null value
  Status AppendNull() final {
    length_ += 1;
    null_count_ += 1;

    return indices_builder_.AppendNull();
  }

  Status AppendNulls(int64_t length) final {
    length_ += length;
    null_count_ += length;

    return indices_builder_.AppendNulls(length);
  }

  Status AppendEmptyValue() final {
    length_ += 1;

    return indices_builder_.AppendEmptyValue();
  }

  Status AppendEmptyValues(int64_t length) final {
    length_ += length;

    return indices_builder_.AppendEmptyValues(length);
  }

  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
    if (!scalar.is_valid) return AppendNulls(n_repeats);

    const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*scalar.type);
    const DictionaryScalar& dict_scalar =
        internal::checked_cast<const DictionaryScalar&>(scalar);
    const auto& dict = internal::checked_cast<const typename TypeTraits<T>::ArrayType&>(
        *dict_scalar.value.dictionary);
    ARROW_RETURN_NOT_OK(Reserve(n_repeats));
    switch (dict_ty.index_type()->id()) {
      case Type::UINT8:
        return AppendScalarImpl<UInt8Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::INT8:
        return AppendScalarImpl<Int8Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::UINT16:
        return AppendScalarImpl<UInt16Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::INT16:
        return AppendScalarImpl<Int16Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::UINT32:
        return AppendScalarImpl<UInt32Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::INT32:
        return AppendScalarImpl<Int32Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::UINT64:
        return AppendScalarImpl<UInt64Type>(dict, *dict_scalar.value.index, n_repeats);
      case Type::INT64:
        return AppendScalarImpl<Int64Type>(dict, *dict_scalar.value.index, n_repeats);
      default:
        return Status::TypeError("Invalid index type: ", dict_ty);
    }
    return Status::OK();
  }

  Status AppendScalars(const ScalarVector& scalars) override {
    for (const auto& scalar : scalars) {
      ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
    }
    return Status::OK();
  }

  Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
    // Visit the indices and insert the unpacked values.
    const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*array.type);
    // See if possible to avoid using ToArrayData here
    const typename TypeTraits<T>::ArrayType dict(array.dictionary().ToArrayData());
    ARROW_RETURN_NOT_OK(Reserve(length));
    switch (dict_ty.index_type()->id()) {
      case Type::UINT8:
        return AppendArraySliceImpl<uint8_t>(dict, array, offset, length);
      case Type::INT8:
        return AppendArraySliceImpl<int8_t>(dict, array, offset, length);
      case Type::UINT16:
        return AppendArraySliceImpl<uint16_t>(dict, array, offset, length);
      case Type::INT16:
        return AppendArraySliceImpl<int16_t>(dict, array, offset, length);
      case Type::UINT32:
        return AppendArraySliceImpl<uint32_t>(dict, array, offset, length);
      case Type::INT32:
        return AppendArraySliceImpl<int32_t>(dict, array, offset, length);
      case Type::UINT64:
        return AppendArraySliceImpl<uint64_t>(dict, array, offset, length);
      case Type::INT64:
        return AppendArraySliceImpl<int64_t>(dict, array, offset, length);
      default:
        return Status::TypeError("Invalid index type: ", dict_ty);
    }
    return Status::OK();
  }

  /// \brief Insert values into the dictionary's memo, but do not append any
  /// indices. Can be used to initialize a new builder with known dictionary
  /// values
  /// \param[in] values dictionary values to add to memo. Type must match
  /// builder type
  Status InsertMemoValues(const Array& values) {
    return memo_table_->InsertValues(values);
  }

  /// \brief Append a whole dense array to the builder
  template <typename T1 = T>
  enable_if_t<!is_fixed_size_binary_type<T1>::value, Status> AppendArray(
      const Array& array) {
    using ArrayType = typename TypeTraits<T>::ArrayType;

#ifndef NDEBUG
    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
        value_type_, array, "Wrong value type of array to be appended"));
#endif

    const auto& concrete_array = static_cast<const ArrayType&>(array);
    for (int64_t i = 0; i < array.length(); i++) {
      if (array.IsNull(i)) {
        ARROW_RETURN_NOT_OK(AppendNull());
      } else {
        ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i)));
      }
    }
    return Status::OK();
  }

  template <typename T1 = T>
  enable_if_fixed_size_binary<T1, Status> AppendArray(const Array& array) {
#ifndef NDEBUG
    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
        value_type_, array, "Wrong value type of array to be appended"));
#endif

    const auto& concrete_array = static_cast<const FixedSizeBinaryArray&>(array);
    for (int64_t i = 0; i < array.length(); i++) {
      if (array.IsNull(i)) {
        ARROW_RETURN_NOT_OK(AppendNull());
      } else {
        ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i)));
      }
    }
    return Status::OK();
  }

  void Reset() override {
    // Perform a partial reset. Call ResetFull to also reset the accumulated
    // dictionary values
    ArrayBuilder::Reset();
    indices_builder_.Reset();
  }

  /// \brief Reset and also clear accumulated dictionary values in memo table
  void ResetFull() {
    Reset();
    memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_));
  }

  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
    capacity_ = indices_builder_.capacity();
    return Status::OK();
  }

  /// \brief Return dictionary indices and a delta dictionary since the last
  /// time that Finish or FinishDelta were called, and reset state of builder
  /// (except the memo table)
  Status FinishDelta(std::shared_ptr<Array>* out_indices,
                     std::shared_ptr<Array>* out_delta) {
    std::shared_ptr<ArrayData> indices_data;
    std::shared_ptr<ArrayData> delta_data;
    ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data));
    *out_indices = MakeArray(indices_data);
    *out_delta = MakeArray(delta_data);
    return Status::OK();
  }

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override {
    return ::arrow::dictionary(indices_builder_.type(), value_type_);
  }

 protected:
  template <typename c_type>
  Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
                              const ArraySpan& array, int64_t offset, int64_t length) {
    const c_type* values = array.GetValues<c_type>(1) + offset;
    return VisitBitBlocks(
        array.buffers[0].data, array.offset + offset, length,
        [&](const int64_t position) {
          const int64_t index = static_cast<int64_t>(values[position]);
          if (dict.IsValid(index)) {
            return Append(dict.GetView(index));
          }
          return AppendNull();
        },
        [&]() { return AppendNull(); });
  }

  template <typename IndexType>
  Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
                          const Scalar& index_scalar, int64_t n_repeats) {
    using ScalarType = typename TypeTraits<IndexType>::ScalarType;
    const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
    if (index_scalar.is_valid && dict.IsValid(index)) {
      const auto& value = dict.GetView(index);
      for (int64_t i = 0; i < n_repeats; i++) {
        ARROW_RETURN_NOT_OK(Append(value));
      }
      return Status::OK();
    }
    return AppendNulls(n_repeats);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    std::shared_ptr<ArrayData> dictionary;
    ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary));

    // Set type of array data to the right dictionary type
    (*out)->type = type();
    (*out)->dictionary = dictionary;
    return Status::OK();
  }

  Status FinishWithDictOffset(int64_t dict_offset,
                              std::shared_ptr<ArrayData>* out_indices,
                              std::shared_ptr<ArrayData>* out_dictionary) {
    // Finalize indices array
    ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices));

    // Generate dictionary array from hash table contents
    ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary));
    delta_offset_ = memo_table_->size();

    // Update internals for further uses of this DictionaryBuilder
    ArrayBuilder::Reset();
    return Status::OK();
  }

  std::unique_ptr<DictionaryMemoTable> memo_table_;

  // The size of the dictionary memo at last invocation of Finish, to use in
  // FinishDelta for computing dictionary deltas
  int32_t delta_offset_;

  // Only used for FixedSizeBinaryType
  int32_t byte_width_;

  BuilderType indices_builder_;
  std::shared_ptr<DataType> value_type_;
};
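
// Editor's note (added commentary, not upstream): Finish() emits the indices
// together with the full dictionary accumulated so far, while FinishDelta()
// emits the indices plus only the values memoized since the previous
// Finish/FinishDelta call. Because Reset() leaves the memo table intact,
// repeated FinishDelta() calls produce IPC-style delta dictionaries over a
// growing shared value set; use ResetFull() to drop the memo as well.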

template <typename BuilderType>
class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
 public:
  template <typename B = BuilderType>
  DictionaryBuilderBase(
      enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
          start_int_size,
      const std::shared_ptr<DataType>& value_type,
      MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}

  explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
                                 MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(pool) {}

  explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
                                 const std::shared_ptr<DataType>& value_type,
                                 MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(index_type, pool) {}

  template <typename B = BuilderType>
  explicit DictionaryBuilderBase(
      enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
          start_int_size,
      MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}

  explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(pool) {}

  explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
                                 MemoryPool* pool = default_memory_pool())
      : ArrayBuilder(pool), indices_builder_(pool) {}

  /// \brief Append a scalar null value
  Status AppendNull() final {
    length_ += 1;
    null_count_ += 1;

    return indices_builder_.AppendNull();
  }

  Status AppendNulls(int64_t length) final {
    length_ += length;
    null_count_ += length;

    return indices_builder_.AppendNulls(length);
  }

  Status AppendEmptyValue() final {
    length_ += 1;

    return indices_builder_.AppendEmptyValue();
  }

  Status AppendEmptyValues(int64_t length) final {
    length_ += length;

    return indices_builder_.AppendEmptyValues(length);
  }

  /// \brief Append a whole dense array to the builder
  Status AppendArray(const Array& array) {
#ifndef NDEBUG
    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
        Type::NA, array, "Wrong value type of array to be appended"));
#endif
    for (int64_t i = 0; i < array.length(); i++) {
      ARROW_RETURN_NOT_OK(AppendNull());
    }
    return Status::OK();
  }

  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);

    ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
    capacity_ = indices_builder_.capacity();
    return Status::OK();
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
    (*out)->type = dictionary((*out)->type, null());
    (*out)->dictionary = NullArray(0).data();
    return Status::OK();
  }

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }

  std::shared_ptr<DataType> type() const override {
    return ::arrow::dictionary(indices_builder_.type(), null());
  }

 protected:
  BuilderType indices_builder_;
};

} // namespace internal

/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
/// smallest index size that can accommodate the dictionary indices
template <typename T>
class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
 public:
  using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
  using BASE::BASE;

  /// \brief Append dictionary indices directly without modifying memo
  ///
  /// NOTE: Experimental API
  Status AppendIndices(const int64_t* values, int64_t length,
                       const uint8_t* valid_bytes = NULLPTR) {
    int64_t null_count_before = this->indices_builder_.null_count();
    ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
    this->capacity_ = this->indices_builder_.capacity();
    this->length_ += length;
    this->null_count_ += this->indices_builder_.null_count() - null_count_before;
    return Status::OK();
  }
};

/// \brief A DictionaryArray builder that always returns int32 dictionary
/// indices so that data cast to dictionary form will have a consistent index
/// type, e.g. for creating a ChunkedArray
template <typename T>
class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
 public:
  using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
  using BASE::BASE;

  /// \brief Append dictionary indices directly without modifying memo
  ///
  /// NOTE: Experimental API
  Status AppendIndices(const int32_t* values, int64_t length,
                       const uint8_t* valid_bytes = NULLPTR) {
    int64_t null_count_before = this->indices_builder_.null_count();
    ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
    this->capacity_ = this->indices_builder_.capacity();
    this->length_ += length;
    this->null_count_ += this->indices_builder_.null_count() - null_count_before;
    return Status::OK();
  }
};

// ----------------------------------------------------------------------
// Binary / Unicode builders
// (compatibility aliases; those used to be derived classes with additional
// Append() overloads, but they have been folded into DictionaryBuilderBase)

using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
using StringDictionaryBuilder = DictionaryBuilder<StringType>;
using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
using StringDictionary32Builder = Dictionary32Builder<StringType>;

/// @}

} // namespace arrow
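
// Editor's usage sketch (illustrative addition, not part of the upstream
// header): dictionary-encoding repeated strings with the aliases above; each
// distinct value is memoized once while an index is appended per value.
//
//   arrow::StringDictionaryBuilder builder;
//   ARROW_RETURN_NOT_OK(builder.Append("red"));
//   ARROW_RETURN_NOT_OK(builder.Append("blue"));
//   ARROW_RETURN_NOT_OK(builder.Append("red"));   // reuses the memoized entry
//   ARROW_RETURN_NOT_OK(builder.AppendNull());
//   std::shared_ptr<arrow::DictionaryArray> out;
//   ARROW_RETURN_NOT_OK(builder.Finish(&out));
//   // dictionary: ["red", "blue"]; indices: [0, 1, 0, null]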
@@ -0,0 +1,565 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <limits>
#include <memory>
#include <utility>
#include <vector>

#include "arrow/array/array_nested.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer.h"
#include "arrow/buffer_builder.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup nested-builders
///
/// @{

// ----------------------------------------------------------------------
// List builder

template <typename TYPE>
class BaseListBuilder : public ArrayBuilder {
 public:
  using TypeClass = TYPE;
  using offset_type = typename TypeClass::offset_type;

  /// Use this constructor to incrementally build the value array along with offsets and
  /// null bitmap.
  BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
                  const std::shared_ptr<DataType>& type,
                  int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        offsets_builder_(pool, alignment),
        value_builder_(value_builder),
        value_field_(type->field(0)->WithType(NULLPTR)) {}

  BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
                  int64_t alignment = kDefaultBufferAlignment)
      : BaseListBuilder(pool, value_builder, list(value_builder->type()), alignment) {}

  Status Resize(int64_t capacity) override {
    if (capacity > maximum_elements()) {
      return Status::CapacityError("List array cannot reserve space for more than ",
                                   maximum_elements(), " got ", capacity);
    }
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));

    // One more than requested for offsets
    ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
    return ArrayBuilder::Resize(capacity);
  }

  void Reset() override {
    ArrayBuilder::Reset();
    offsets_builder_.Reset();
    value_builder_->Reset();
  }

  /// \brief Vector append
  ///
  /// If passed, valid_bytes is of equal length to values, and any zero byte
  /// will be considered as a null for that slot
  Status AppendValues(const offset_type* offsets, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(valid_bytes, length);
    offsets_builder_.UnsafeAppend(offsets, length);
    return Status::OK();
  }

  /// \brief Start a new variable-length list slot
  ///
  /// This function should be called before beginning to append elements to the
  /// value builder
  Status Append(bool is_valid = true) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(is_valid);
    return AppendNextOffset();
  }
|
||||
Status AppendNull() final { return Append(false); }
|
||||
|
||||
Status AppendNulls(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
|
||||
UnsafeAppendToBitmap(length, false);
|
||||
const int64_t num_values = value_builder_->length();
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final { return Append(true); }
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(Reserve(length));
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
|
||||
UnsafeAppendToBitmap(length, true);
|
||||
const int64_t num_values = value_builder_->length();
|
||||
for (int64_t i = 0; i < length; ++i) {
|
||||
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override {
|
||||
const offset_type* offsets = array.GetValues<offset_type>(1);
|
||||
const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
|
||||
for (int64_t row = offset; row < offset + length; row++) {
|
||||
if (!validity || bit_util::GetBit(validity, array.offset + row)) {
|
||||
ARROW_RETURN_NOT_OK(Append());
|
||||
int64_t slot_length = offsets[row + 1] - offsets[row];
|
||||
ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(array.child_data[0],
|
||||
offsets[row], slot_length));
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(AppendNull());
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
|
||||
ARROW_RETURN_NOT_OK(AppendNextOffset());
|
||||
|
||||
// Offset padding zeroed by BufferBuilder
|
||||
std::shared_ptr<Buffer> offsets, null_bitmap;
|
||||
ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
|
||||
ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
|
||||
|
||||
if (value_builder_->length() == 0) {
|
||||
// Try to make sure we get a non-null values buffer (ARROW-2744)
|
||||
ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
|
||||
}
|
||||
|
||||
std::shared_ptr<ArrayData> items;
|
||||
ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items));
|
||||
|
||||
*out = ArrayData::Make(type(), length_, {null_bitmap, offsets}, {std::move(items)},
|
||||
null_count_);
|
||||
Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ValidateOverflow(int64_t new_elements) const {
|
||||
auto new_length = value_builder_->length() + new_elements;
|
||||
if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
|
||||
return Status::CapacityError("List array cannot contain more than ",
|
||||
maximum_elements(), " elements, have ", new_elements);
|
||||
} else {
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
ArrayBuilder* value_builder() const { return value_builder_.get(); }
|
||||
|
||||
// Cannot make this a static attribute because of linking issues
|
||||
static constexpr int64_t maximum_elements() {
|
||||
return std::numeric_limits<offset_type>::max() - 1;
|
||||
}
|
||||
|
||||
std::shared_ptr<DataType> type() const override {
|
||||
return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
|
||||
}
|
||||
|
||||
protected:
|
||||
TypedBufferBuilder<offset_type> offsets_builder_;
|
||||
std::shared_ptr<ArrayBuilder> value_builder_;
|
||||
std::shared_ptr<Field> value_field_;
|
||||
|
||||
Status AppendNextOffset() {
|
||||
ARROW_RETURN_NOT_OK(ValidateOverflow(0));
|
||||
const int64_t num_values = value_builder_->length();
|
||||
return offsets_builder_.Append(static_cast<offset_type>(num_values));
|
||||
}
|
||||
};
|
||||

/// \class ListBuilder
/// \brief Builder class for variable-length list array value types
///
/// To use this class, you must append values to the child array builder and use
/// the Append function to delimit each distinct list value (once the values
/// have been appended to the child array) or use the bulk API to append
/// a sequence of offsets and null values.
///
/// A note on types. Per arrow/type.h all types in the c++ implementation are
/// logical so even though this class always builds list arrays, it can
/// represent multiple different logical types. If no logical type is provided
/// at construction time, the class defaults to List<T> where T is taken from the
/// value_builder/values that the object is constructed with.
class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
 public:
  using BaseListBuilder::BaseListBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
};
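// --- Editor's usage sketch (an addition, not part of the original header). ---
// Demonstrates the documented protocol: Append() opens each list slot, then
// the child builder receives that slot's values. Assumes Int64Builder from
// builder_primitive.h is also available; the function name is hypothetical.
inline Status ListBuilderExample(std::shared_ptr<ListArray>* out) {
  MemoryPool* pool = default_memory_pool();
  auto value_builder = std::make_shared<Int64Builder>(pool);
  ListBuilder list_builder(pool, value_builder);
  ARROW_RETURN_NOT_OK(list_builder.Append());               // open slot [1, 2]
  ARROW_RETURN_NOT_OK(value_builder->AppendValues({1, 2}));
  ARROW_RETURN_NOT_OK(list_builder.AppendNull());           // null slot
  ARROW_RETURN_NOT_OK(list_builder.Append());               // open slot [3]
  ARROW_RETURN_NOT_OK(value_builder->Append(3));
  return list_builder.Finish(out);                          // [[1, 2], null, [3]]
}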

/// \class LargeListBuilder
/// \brief Builder class for large variable-length list array value types
///
/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
 public:
  using BaseListBuilder::BaseListBuilder;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
};

// ----------------------------------------------------------------------
// Map builder

/// \class MapBuilder
/// \brief Builder class for arrays of variable-size maps
///
/// To use this class, you must append values to the key and item array builders
/// and use the Append function to delimit each distinct map (once the keys and items
/// have been appended) or use the bulk API to append a sequence of offsets and null
/// maps.
///
/// Key uniqueness and ordering are not validated.
class ARROW_EXPORT MapBuilder : public ArrayBuilder {
 public:
  /// Use this constructor to define the built array's type explicitly. If key_builder
  /// or item_builder has indeterminate type, this builder will also.
  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
             const std::shared_ptr<ArrayBuilder>& item_builder,
             const std::shared_ptr<DataType>& type);

  /// Use this constructor to infer the built array's type. If key_builder or
  /// item_builder has indeterminate type, this builder will also.
  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
             const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);

  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
             const std::shared_ptr<DataType>& type);

  Status Resize(int64_t capacity) override;
  void Reset() override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }

  /// \brief Vector append
  ///
  /// If passed, valid_bytes is of equal length to values, and any zero byte
  /// will be considered as a null for that slot
  Status AppendValues(const int32_t* offsets, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  /// \brief Start a new variable-length map slot
  ///
  /// This function should be called before beginning to append elements to the
  /// key and item builders
  Status Append();

  Status AppendNull() final;

  Status AppendNulls(int64_t length) final;

  Status AppendEmptyValue() final;

  Status AppendEmptyValues(int64_t length) final;

  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    const int32_t* offsets = array.GetValues<int32_t>(1);
    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
    for (int64_t row = offset; row < offset + length; row++) {
      if (!validity || bit_util::GetBit(validity, array.offset + row)) {
        ARROW_RETURN_NOT_OK(Append());
        const int64_t slot_length = offsets[row + 1] - offsets[row];
        // Add the inner StructArray offset to the Map/List offset
        int64_t key_value_offset = array.child_data[0].offset + offsets[row];
        ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(
            array.child_data[0].child_data[0], key_value_offset, slot_length));
        ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(
            array.child_data[0].child_data[1], key_value_offset, slot_length));
      } else {
        ARROW_RETURN_NOT_OK(AppendNull());
      }
    }
    return Status::OK();
  }

  /// \brief Get builder to append keys.
  ///
  /// Appending a key with this builder should be followed by appending
  /// an item or null value with item_builder().
  ArrayBuilder* key_builder() const { return key_builder_.get(); }

  /// \brief Get builder to append items
  ///
  /// Appending an item with this builder should have been preceded
  /// by appending a key with key_builder().
  ArrayBuilder* item_builder() const { return item_builder_.get(); }

  /// \brief Get builder to add Map entries as struct values.
  ///
  /// This is used instead of key_builder()/item_builder() and allows
  /// the Map to be built as a list of struct values.
  ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }

  std::shared_ptr<DataType> type() const override {
    // Key and Item builder may update types, but they don't contain the field names,
    // so we need to reconstruct the type. (See ARROW-13735.)
    return std::make_shared<MapType>(
        field(entries_name_,
              struct_({field(key_name_, key_builder_->type(), false),
                       field(item_name_, item_builder_->type(), item_nullable_)}),
              false),
        keys_sorted_);
  }

  Status ValidateOverflow(int64_t new_elements) {
    return list_builder_->ValidateOverflow(new_elements);
  }

 protected:
  inline Status AdjustStructBuilderLength();

 protected:
  bool keys_sorted_ = false;
  bool item_nullable_ = false;
  std::string entries_name_;
  std::string key_name_;
  std::string item_name_;
  std::shared_ptr<ListBuilder> list_builder_;
  std::shared_ptr<ArrayBuilder> key_builder_;
  std::shared_ptr<ArrayBuilder> item_builder_;
};
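// --- Editor's usage sketch (an addition, not part of the original header). ---
// One map slot per Append(); keys and items are then pushed pairwise through
// the child builders. Assumes StringBuilder and Int32Builder from the other
// builder headers; the function name is hypothetical.
inline Status MapBuilderExample(std::shared_ptr<MapArray>* out) {
  MemoryPool* pool = default_memory_pool();
  auto key_builder = std::make_shared<StringBuilder>(pool);
  auto item_builder = std::make_shared<Int32Builder>(pool);
  MapBuilder map_builder(pool, key_builder, item_builder);
  ARROW_RETURN_NOT_OK(map_builder.Append());      // open map {"a": 1, "b": 2}
  ARROW_RETURN_NOT_OK(key_builder->Append("a"));
  ARROW_RETURN_NOT_OK(item_builder->Append(1));
  ARROW_RETURN_NOT_OK(key_builder->Append("b"));
  ARROW_RETURN_NOT_OK(item_builder->Append(2));
  ARROW_RETURN_NOT_OK(map_builder.AppendNull());  // null map slot
  return map_builder.Finish(out);
}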

// ----------------------------------------------------------------------
// FixedSizeList builder

/// \class FixedSizeListBuilder
/// \brief Builder class for fixed-length list array value types
class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
 public:
  /// Use this constructor to define the built array's type explicitly. If value_builder
  /// has indeterminate type, this builder will also.
  FixedSizeListBuilder(MemoryPool* pool,
                       std::shared_ptr<ArrayBuilder> const& value_builder,
                       int32_t list_size);

  /// Use this constructor to infer the built array's type. If value_builder has
  /// indeterminate type, this builder will also.
  FixedSizeListBuilder(MemoryPool* pool,
                       std::shared_ptr<ArrayBuilder> const& value_builder,
                       const std::shared_ptr<DataType>& type);

  Status Resize(int64_t capacity) override;
  void Reset() override;
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }

  /// \brief Append a valid fixed length list.
  ///
  /// This function affects only the validity bitmap; the child values must be appended
  /// using the child array builder.
  Status Append();

  /// \brief Vector append
  ///
  /// If passed, valid_bytes will be read and any zero byte
  /// will cause the corresponding slot to be null
  ///
  /// This function affects only the validity bitmap; the child values must be appended
  /// using the child array builder. This includes appending nulls for null lists.
  /// XXX this restriction is confusing, should this method be omitted?
  Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);

  /// \brief Append a null fixed length list.
  ///
  /// The child array builder will have the appropriate number of nulls appended
  /// automatically.
  Status AppendNull() final;

  /// \brief Append length null fixed length lists.
  ///
  /// The child array builder will have the appropriate number of nulls appended
  /// automatically.
  Status AppendNulls(int64_t length) final;

  Status ValidateOverflow(int64_t new_elements);

  Status AppendEmptyValue() final;

  Status AppendEmptyValues(int64_t length) final;

  Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
    for (int64_t row = offset; row < offset + length; row++) {
      if (!validity || bit_util::GetBit(validity, array.offset + row)) {
        ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
            array.child_data[0], list_size_ * (array.offset + row), list_size_));
        ARROW_RETURN_NOT_OK(Append());
      } else {
        ARROW_RETURN_NOT_OK(AppendNull());
      }
    }
    return Status::OK();
  }

  ArrayBuilder* value_builder() const { return value_builder_.get(); }

  std::shared_ptr<DataType> type() const override {
    return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
  }

  // Cannot make this a static attribute because of linking issues
  static constexpr int64_t maximum_elements() {
    return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
  }

 protected:
  std::shared_ptr<Field> value_field_;
  const int32_t list_size_;
  std::shared_ptr<ArrayBuilder> value_builder_;
};
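// --- Editor's usage sketch (an addition, not part of the original header). ---
// Each Append() validates one fixed-size slot; exactly list_size child values
// must be supplied per valid slot. Assumes DoubleBuilder from
// builder_primitive.h; the function name is hypothetical.
inline Status FixedSizeListBuilderExample(std::shared_ptr<FixedSizeListArray>* out) {
  MemoryPool* pool = default_memory_pool();
  auto value_builder = std::make_shared<DoubleBuilder>(pool);
  FixedSizeListBuilder fsl_builder(pool, value_builder, /*list_size=*/2);
  ARROW_RETURN_NOT_OK(fsl_builder.Append());                    // slot [1.0, 2.0]
  ARROW_RETURN_NOT_OK(value_builder->AppendValues({1.0, 2.0}));
  // Null slot: child nulls are appended automatically (see AppendNull above).
  ARROW_RETURN_NOT_OK(fsl_builder.AppendNull());
  return fsl_builder.Finish(out);
}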

// ----------------------------------------------------------------------
// Struct

// ---------------------------------------------------------------------------------
// StructArray builder
/// The Append, Resize and Reserve methods act on the StructBuilder itself.
/// Please make sure the corresponding methods of all child builders are
/// called consistently to maintain data-structure consistency.
class ARROW_EXPORT StructBuilder : public ArrayBuilder {
 public:
  /// If any of field_builders has indeterminate type, this builder will also
  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
                std::vector<std::shared_ptr<ArrayBuilder>> field_builders);

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }

  /// The null bitmap is of equal length to every child field, and any zero byte
  /// will be considered as a null for that field; however, users must call the
  /// append or advance methods of the child builders independently to insert
  /// data.
  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(valid_bytes, length);
    return Status::OK();
  }

  /// Append an element to the Struct. All child-builders' Append method must
  /// be called independently to maintain data-structure consistency.
  Status Append(bool is_valid = true) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendToBitmap(is_valid);
    return Status::OK();
  }

  /// \brief Append a null value. Automatically appends an empty value to each child
  /// builder.
  Status AppendNull() final {
    for (const auto& field : children_) {
      ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
    }
    return Append(false);
  }

  /// \brief Append multiple null values. Automatically appends empty values to each
  /// child builder.
  Status AppendNulls(int64_t length) final {
    for (const auto& field : children_) {
      ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(length, false);
    return Status::OK();
  }

  Status AppendEmptyValue() final {
    for (const auto& field : children_) {
      ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
    }
    return Append(true);
  }

  Status AppendEmptyValues(int64_t length) final {
    for (const auto& field : children_) {
      ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
    }
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(length, true);
    return Status::OK();
  }

  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    for (int i = 0; static_cast<size_t>(i) < children_.size(); i++) {
      ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(array.child_data[i],
                                                         array.offset + offset, length));
    }
    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
    ARROW_RETURN_NOT_OK(Reserve(length));
    UnsafeAppendToBitmap(validity, array.offset + offset, length);
    return Status::OK();
  }

  void Reset() override;

  ArrayBuilder* field_builder(int i) const { return children_[i].get(); }

  int num_fields() const { return static_cast<int>(children_.size()); }

  std::shared_ptr<DataType> type() const override;

 private:
  std::shared_ptr<DataType> type_;
};

/// @}

} // namespace arrow
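// --- Editor's usage sketch (not part of the header above). ---
// One validity bit per Append(); every child builder must be advanced once per
// struct slot. Assumes the public API from <arrow/api.h>; the function name is
// hypothetical.

#include <arrow/api.h>

arrow::Status StructBuilderExample(std::shared_ptr<arrow::StructArray>* out) {
  auto pool = arrow::default_memory_pool();
  auto x_builder = std::make_shared<arrow::Int32Builder>(pool);
  auto s_builder = std::make_shared<arrow::StringBuilder>(pool);
  auto type = arrow::struct_(
      {arrow::field("x", arrow::int32()), arrow::field("s", arrow::utf8())});
  arrow::StructBuilder struct_builder(type, pool, {x_builder, s_builder});
  ARROW_RETURN_NOT_OK(struct_builder.Append());      // slot {x: 1, s: "one"}
  ARROW_RETURN_NOT_OK(x_builder->Append(1));
  ARROW_RETURN_NOT_OK(s_builder->Append("one"));
  ARROW_RETURN_NOT_OK(struct_builder.AppendNull());  // null slot; children padded
  return struct_builder.Finish(out);
}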
@@ -0,0 +1,551 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <algorithm>
#include <memory>
#include <vector>

#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/result.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"

namespace arrow {

class ARROW_EXPORT NullBuilder : public ArrayBuilder {
 public:
  explicit NullBuilder(MemoryPool* pool = default_memory_pool(),
                       int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool) {}
  explicit NullBuilder(const std::shared_ptr<DataType>& type,
                       MemoryPool* pool = default_memory_pool(),
                       int64_t alignment = kDefaultBufferAlignment)
      : NullBuilder(pool, alignment) {}

  /// \brief Append the specified number of null elements
  Status AppendNulls(int64_t length) final {
    if (length < 0) return Status::Invalid("length must be non-negative");
    null_count_ += length;
    length_ += length;
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final { return AppendNulls(1); }

  Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }

  Status AppendEmptyValue() final { return AppendEmptyValues(1); }

  Status Append(std::nullptr_t) { return AppendNull(); }

  Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override {
    return AppendNulls(length);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  std::shared_ptr<DataType> type() const override { return null(); }

  Status Finish(std::shared_ptr<NullArray>* out) { return FinishTyped(out); }
};

/// \addtogroup numeric-builders
///
/// @{

/// Base class for all Builders that emit an Array of a scalar numerical type.
template <typename T>
class NumericBuilder : public ArrayBuilder {
 public:
  using TypeClass = T;
  using value_type = typename T::c_type;
  using ArrayType = typename TypeTraits<T>::ArrayType;

  template <typename T1 = T>
  explicit NumericBuilder(
      enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool(),
      int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment),
        type_(TypeTraits<T>::type_singleton()),
        data_builder_(pool, alignment) {}

  NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
                 int64_t alignment = kDefaultBufferAlignment)
      : ArrayBuilder(pool, alignment), type_(type), data_builder_(pool, alignment) {}

  /// Append a single scalar and increase the size if necessary.
  Status Append(const value_type val) {
    ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
    UnsafeAppend(val);
    return Status::OK();
  }

  /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
  /// The memory at the corresponding data slot is set to 0 to prevent
  /// uninitialized memory access
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, value_type{});  // zero
    UnsafeSetNull(length);
    return Status::OK();
  }

  /// \brief Append a single null element
  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(value_type{});  // zero
    UnsafeAppendToBitmap(false);
    return Status::OK();
  }

  /// \brief Append an empty element
  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(value_type{});  // zero
    UnsafeAppendToBitmap(true);
    return Status::OK();
  }

  /// \brief Append several empty elements
  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, value_type{});  // zero
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }

  void Reset() override {
    data_builder_.Reset();
    ArrayBuilder::Reset();
  }

  Status Resize(int64_t capacity) override {
    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
    capacity = std::max(capacity, kMinBuilderCapacity);
    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
    return ArrayBuilder::Resize(capacity);
  }

  value_type operator[](int64_t index) const { return GetValue(index); }

  value_type& operator[](int64_t index) {
    return reinterpret_cast<value_type*>(data_builder_.mutable_data())[index];
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const value_type* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values, length);
    // length_ is updated by this
    ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] bitmap a validity bitmap to copy (may be null)
  /// \param[in] bitmap_offset an offset into the validity bitmap
  /// \return Status
  Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap,
                      int64_t bitmap_offset) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values, length);
    // length_ is updated by this
    ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const value_type* values, int64_t length,
                      const std::vector<bool>& is_valid) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values, length);
    // length_ is updated by this
    ArrayBuilder::UnsafeAppendToBitmap(is_valid);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of values
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const std::vector<value_type>& values,
                      const std::vector<bool>& is_valid) {
    return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
  }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of values
  /// \return Status
  Status AppendValues(const std::vector<value_type>& values) {
    return AppendValues(values.data(), static_cast<int64_t>(values.size()));
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
    ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
                          null_bitmap_builder_.FinishWithLength(length_));
    ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
    *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
    capacity_ = length_ = null_count_ = 0;
    return Status::OK();
  }

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \return Status
  template <typename ValuesIter>
  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values_begin, values_end);
    // this updates the length_
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot, with a specified nullmap
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \param[in] valid_begin InputIterator with elements indicating valid(1)
  /// or null(0) values.
  /// \return Status
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    static_assert(!internal::is_null_pointer<ValidIter>::value,
                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
                  "version instead");
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values_begin, values_end);
    null_bitmap_builder_.UnsafeAppend<true>(
        length, [&valid_begin]() -> bool { return *valid_begin++; });
    length_ = null_bitmap_builder_.length();
    null_count_ = null_bitmap_builder_.false_count();
    return Status::OK();
  }

  // Same as above, with a pointer type ValidIter
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(values_begin, values_end);
    // this updates the length_
    if (valid_begin == NULLPTR) {
      UnsafeSetNotNull(length);
    } else {
      null_bitmap_builder_.UnsafeAppend<true>(
          length, [&valid_begin]() -> bool { return *valid_begin++; });
      length_ = null_bitmap_builder_.length();
      null_count_ = null_bitmap_builder_.false_count();
    }

    return Status::OK();
  }

  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    return AppendValues(array.GetValues<value_type>(1) + offset, length,
                        array.GetValues<uint8_t>(0, 0), array.offset + offset);
  }

  /// Append a single scalar under the assumption that the underlying Buffer is
  /// large enough.
  ///
  /// This method does not capacity-check; make sure to call Reserve
  /// beforehand.
  void UnsafeAppend(const value_type val) {
    ArrayBuilder::UnsafeAppendToBitmap(true);
    data_builder_.UnsafeAppend(val);
  }

  void UnsafeAppendNull() {
    ArrayBuilder::UnsafeAppendToBitmap(false);
    data_builder_.UnsafeAppend(value_type{});  // zero
  }

  std::shared_ptr<DataType> type() const override { return type_; }

 protected:
  std::shared_ptr<DataType> type_;
  TypedBufferBuilder<value_type> data_builder_;
};
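// --- Editor's usage sketch (an addition, not part of the original header). ---
// Typical safe and unsafe append paths of NumericBuilder, spelled out for
// Int64Type (aliased as Int64Builder just below); assumes the matching
// NumericArray types from the array headers; the function name is hypothetical.
inline Status NumericBuilderExample(std::shared_ptr<NumericArray<Int64Type>>* out) {
  NumericBuilder<Int64Type> builder;
  ARROW_RETURN_NOT_OK(builder.Reserve(2));
  builder.UnsafeAppend(1);                            // no capacity check: reserved
  builder.UnsafeAppendNull();
  ARROW_RETURN_NOT_OK(builder.AppendValues({2, 3}));  // bulk, checked append
  return builder.Finish(out);                         // Int64Array [1, null, 2, 3]
}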

// Builders

using UInt8Builder = NumericBuilder<UInt8Type>;
using UInt16Builder = NumericBuilder<UInt16Type>;
using UInt32Builder = NumericBuilder<UInt32Type>;
using UInt64Builder = NumericBuilder<UInt64Type>;

using Int8Builder = NumericBuilder<Int8Type>;
using Int16Builder = NumericBuilder<Int16Type>;
using Int32Builder = NumericBuilder<Int32Type>;
using Int64Builder = NumericBuilder<Int64Type>;

using HalfFloatBuilder = NumericBuilder<HalfFloatType>;
using FloatBuilder = NumericBuilder<FloatType>;
using DoubleBuilder = NumericBuilder<DoubleType>;

/// @}

/// \addtogroup temporal-builders
///
/// @{

using Date32Builder = NumericBuilder<Date32Type>;
using Date64Builder = NumericBuilder<Date64Type>;
using Time32Builder = NumericBuilder<Time32Type>;
using Time64Builder = NumericBuilder<Time64Type>;
using TimestampBuilder = NumericBuilder<TimestampType>;
using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
using DurationBuilder = NumericBuilder<DurationType>;

/// @}

class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
 public:
  using TypeClass = BooleanType;
  using value_type = bool;

  explicit BooleanBuilder(MemoryPool* pool = default_memory_pool(),
                          int64_t alignment = kDefaultBufferAlignment);

  BooleanBuilder(const std::shared_ptr<DataType>& type,
                 MemoryPool* pool = default_memory_pool(),
                 int64_t alignment = kDefaultBufferAlignment);

  /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
  Status AppendNulls(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, false);
    UnsafeSetNull(length);
    return Status::OK();
  }

  Status AppendNull() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppendNull();
    return Status::OK();
  }

  Status AppendEmptyValue() final {
    ARROW_RETURN_NOT_OK(Reserve(1));
    data_builder_.UnsafeAppend(false);
    UnsafeSetNotNull(1);
    return Status::OK();
  }

  Status AppendEmptyValues(int64_t length) final {
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend(length, false);
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// Scalar append
  Status Append(const bool val) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(val);
    return Status::OK();
  }

  Status Append(const uint8_t val) { return Append(val != 0); }

  /// Scalar append, without checking for capacity
  void UnsafeAppend(const bool val) {
    data_builder_.UnsafeAppend(val);
    UnsafeAppendToBitmap(true);
  }

  void UnsafeAppendNull() {
    data_builder_.UnsafeAppend(false);
    UnsafeAppendToBitmap(false);
  }

  void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous array of bytes (non-zero is 1)
  /// \param[in] length the number of values to append
  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
  /// indicates a valid (non-null) value
  /// \return Status
  Status AppendValues(const uint8_t* values, int64_t length,
                      const uint8_t* valid_bytes = NULLPTR);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a bitmap of values
  /// \param[in] length the number of values to append
  /// \param[in] validity a validity bitmap to copy (may be null)
  /// \param[in] offset an offset into the values and validity bitmaps
  /// \return Status
  Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity,
                      int64_t offset);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a contiguous C array of values
  /// \param[in] length the number of values to append
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const uint8_t* values, int64_t length,
                      const std::vector<bool>& is_valid);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of bytes
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const std::vector<uint8_t>& values,
                      const std::vector<bool>& is_valid);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values a std::vector of bytes
  /// \return Status
  Status AppendValues(const std::vector<uint8_t>& values);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values an std::vector<bool> indicating true (1) or false
  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
  /// (0). Equal in length to values
  /// \return Status
  Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values an std::vector<bool> indicating true (1) or false
  /// \return Status
  Status AppendValues(const std::vector<bool>& values);

  /// \brief Append a sequence of elements in one shot
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \return Status
  template <typename ValuesIter>
  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend<false>(
        length, [&values_begin]() -> bool { return *values_begin++; });
    // this updates length_
    UnsafeSetNotNull(length);
    return Status::OK();
  }

  /// \brief Append a sequence of elements in one shot, with a specified nullmap
  /// \param[in] values_begin InputIterator to the beginning of the values
  /// \param[in] values_end InputIterator pointing to the end of the values
  /// \param[in] valid_begin InputIterator with elements indicating valid(1)
  /// or null(0) values
  /// \return Status
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    static_assert(!internal::is_null_pointer<ValidIter>::value,
                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
                  "version instead");
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));

    data_builder_.UnsafeAppend<false>(
        length, [&values_begin]() -> bool { return *values_begin++; });
    null_bitmap_builder_.UnsafeAppend<true>(
        length, [&valid_begin]() -> bool { return *valid_begin++; });
    length_ = null_bitmap_builder_.length();
    null_count_ = null_bitmap_builder_.false_count();
    return Status::OK();
  }

  // Same as above, for a pointer type ValidIter
  template <typename ValuesIter, typename ValidIter>
  enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
    ARROW_RETURN_NOT_OK(Reserve(length));
    data_builder_.UnsafeAppend<false>(
        length, [&values_begin]() -> bool { return *values_begin++; });

    if (valid_begin == NULLPTR) {
      UnsafeSetNotNull(length);
    } else {
      null_bitmap_builder_.UnsafeAppend<true>(
          length, [&valid_begin]() -> bool { return *valid_begin++; });
    }
    length_ = null_bitmap_builder_.length();
    null_count_ = null_bitmap_builder_.false_count();
    return Status::OK();
  }

  Status AppendValues(int64_t length, bool value);

  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override {
    return AppendValues(array.GetValues<uint8_t>(1, 0), length,
                        array.GetValues<uint8_t>(0, 0), array.offset + offset);
  }

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<BooleanArray>* out) { return FinishTyped(out); }

  void Reset() override;
  Status Resize(int64_t capacity) override;

  std::shared_ptr<DataType> type() const override { return boolean(); }

 protected:
  TypedBufferBuilder<bool> data_builder_;
};

} // namespace arrow
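// --- Editor's usage sketch (not part of the header above). ---
// Bulk append on BooleanBuilder with a validity vector, assuming the public
// API from <arrow/api.h>; the function name is hypothetical.

#include <arrow/api.h>

#include <vector>

arrow::Status BooleanBuilderExample(std::shared_ptr<arrow::BooleanArray>* out) {
  arrow::BooleanBuilder builder;
  std::vector<bool> values = {true, false, true};
  std::vector<bool> is_valid = {true, true, false};  // last slot becomes null
  ARROW_RETURN_NOT_OK(builder.AppendValues(values, is_valid));
  return builder.Finish(out);  // BooleanArray [true, false, null]
}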
@@ -0,0 +1,66 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Contains declarations of time-related Arrow builder types.

#pragma once

#include <memory>

#include "arrow/array/builder_base.h"
#include "arrow/array/builder_primitive.h"

namespace arrow {

/// \addtogroup temporal-builders
///
/// @{

// TODO(ARROW-7938): this class is untested

class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
 public:
  using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;

  explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment)
      : DayTimeIntervalBuilder(day_time_interval(), pool, alignment) {}

  explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
                                  MemoryPool* pool = default_memory_pool(),
                                  int64_t alignment = kDefaultBufferAlignment)
      : NumericBuilder<DayTimeIntervalType>(type, pool, alignment) {}
};

class ARROW_EXPORT MonthDayNanoIntervalBuilder
    : public NumericBuilder<MonthDayNanoIntervalType> {
 public:
  using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos;

  explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool(),
                                       int64_t alignment = kDefaultBufferAlignment)
      : MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool, alignment) {}

  explicit MonthDayNanoIntervalBuilder(std::shared_ptr<DataType> type,
                                       MemoryPool* pool = default_memory_pool(),
                                       int64_t alignment = kDefaultBufferAlignment)
      : NumericBuilder<MonthDayNanoIntervalType>(type, pool, alignment) {}
};

/// @}

} // namespace arrow
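// --- Editor's usage sketch (not part of the header above). ---
// DayTimeIntervalBuilder is a NumericBuilder whose value_type is the
// DayMilliseconds struct, so plain Append works with aggregate initialization.
// Assumes <arrow/api.h>; the function name is hypothetical (and note the TODO
// above: the class is marked untested upstream).

#include <arrow/api.h>

arrow::Status IntervalBuilderExample(std::shared_ptr<arrow::Array>* out) {
  arrow::DayTimeIntervalBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Append({/*days=*/1, /*milliseconds=*/500}));
  ARROW_RETURN_NOT_OK(builder.AppendNull());
  return builder.Finish(out);
}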
@@ -0,0 +1,254 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "arrow/array/array_nested.h"
#include "arrow/array/builder_base.h"
#include "arrow/array/data.h"
#include "arrow/buffer_builder.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \addtogroup nested-builders
///
/// @{

/// \brief Base class for union array builders.
///
/// Note that while we subclass ArrayBuilder, as union types do not have a
/// validity bitmap, the bitmap builder member of ArrayBuilder is not used.
class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
 public:
  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

  /// \cond FALSE
  using ArrayBuilder::Finish;
  /// \endcond

  Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }

  /// \brief Make a new child builder available to the UnionArray
  ///
  /// \param[in] new_child the child builder
  /// \param[in] field_name the name of the field in the union array type
  /// if type inference is used
  /// \return child index, which is the "type" argument that needs
  /// to be passed to the "Append" method to add a new element to
  /// the union array.
  int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
                     const std::string& field_name = "");

  std::shared_ptr<DataType> type() const override;

  int64_t length() const override { return types_builder_.length(); }

 protected:
  BasicUnionBuilder(MemoryPool* pool, int64_t alignment,
                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
                    const std::shared_ptr<DataType>& type);

  int8_t NextTypeId();

  std::vector<std::shared_ptr<Field>> child_fields_;
  std::vector<int8_t> type_codes_;
  UnionMode::type mode_;

  std::vector<ArrayBuilder*> type_id_to_children_;
  std::vector<int> type_id_to_child_id_;
  // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
  int8_t dense_type_id_ = 0;
  TypedBufferBuilder<int8_t> types_builder_;
};

/// \class DenseUnionBuilder
///
/// This API is EXPERIMENTAL.
class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
 public:
  /// Use this constructor to initialize the UnionBuilder with no child builders,
  /// allowing type to be inferred. You will need to call AppendChild for each of the
  /// children builders you want to use.
  explicit DenseUnionBuilder(MemoryPool* pool,
                             int64_t alignment = kDefaultBufferAlignment)
      : BasicUnionBuilder(pool, alignment, {}, dense_union(FieldVector{})),
        offsets_builder_(pool, alignment) {}

  /// Use this constructor to specify the type explicitly.
  /// You can still add child builders to the union after using this constructor
  DenseUnionBuilder(MemoryPool* pool,
                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
                    const std::shared_ptr<DataType>& type,
                    int64_t alignment = kDefaultBufferAlignment)
      : BasicUnionBuilder(pool, alignment, children, type),
        offsets_builder_(pool, alignment) {}

  Status AppendNull() final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
    // Append a null arbitrarily to the first child
    return child_builder->AppendNull();
  }

  Status AppendNulls(int64_t length) final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
    // Append just a single null to the first child
    return child_builder->AppendNull();
  }

  Status AppendEmptyValue() final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
    // Append an empty value arbitrarily to the first child
    return child_builder->AppendEmptyValue();
  }

  Status AppendEmptyValues(int64_t length) final {
    const int8_t first_child_code = type_codes_[0];
    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
    ARROW_RETURN_NOT_OK(
        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
    // Append just a single empty value to the first child
    return child_builder->AppendEmptyValue();
  }

  /// \brief Append an element to the UnionArray. This must be followed
  /// by an append to the appropriate child builder.
  ///
  /// \param[in] next_type type_id of the child to which the next value will be appended.
  ///
  /// The corresponding child builder must be appended to independently after this method
  /// is called.
  Status Append(int8_t next_type) {
    ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
    if (type_id_to_children_[next_type]->length() == kListMaximumElements) {
      return Status::CapacityError(
          "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
          "child");
    }
    auto offset = static_cast<int32_t>(type_id_to_children_[next_type]->length());
    return offsets_builder_.Append(offset);
  }

  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
                          int64_t length) override;

  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;

 private:
  TypedBufferBuilder<int32_t> offsets_builder_;
};
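// --- Editor's usage sketch (an addition, not part of the original header). ---
// Type inference via AppendChild: each Append(type_id) must be followed by one
// append to the matching child. Assumes Int32Builder/StringBuilder from the
// other builder headers; the function name is hypothetical.
inline Status DenseUnionBuilderExample(MemoryPool* pool,
                                       std::shared_ptr<UnionArray>* out) {
  DenseUnionBuilder builder(pool);
  auto int_builder = std::make_shared<Int32Builder>(pool);
  auto str_builder = std::make_shared<StringBuilder>(pool);
  const int8_t int_code = builder.AppendChild(int_builder, "i");
  const int8_t str_code = builder.AppendChild(str_builder, "s");
  ARROW_RETURN_NOT_OK(builder.Append(int_code));
  ARROW_RETURN_NOT_OK(int_builder->Append(42));
  ARROW_RETURN_NOT_OK(builder.Append(str_code));
  ARROW_RETURN_NOT_OK(str_builder->Append("hello"));
  return builder.Finish(out);  // dense union <i: 42, s: "hello">
}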
|
||||
/// \class SparseUnionBuilder
|
||||
///
|
||||
/// This API is EXPERIMENTAL.
|
||||
class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
|
||||
public:
|
||||
/// Use this constructor to initialize the UnionBuilder with no child builders,
|
||||
/// allowing type to be inferred. You will need to call AppendChild for each of the
|
||||
/// children builders you want to use.
|
||||
explicit SparseUnionBuilder(MemoryPool* pool,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, {}, sparse_union(FieldVector{})) {}
|
||||
|
||||
/// Use this constructor to specify the type explicitly.
|
||||
/// You can still add child builders to the union after using this constructor
|
||||
SparseUnionBuilder(MemoryPool* pool,
|
||||
const std::vector<std::shared_ptr<ArrayBuilder>>& children,
|
||||
const std::shared_ptr<DataType>& type,
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: BasicUnionBuilder(pool, alignment, children, type) {}
|
||||
|
||||
/// \brief Append a null value.
|
||||
///
|
||||
/// A null is appended to the first child, empty values to the other children.
|
||||
Status AppendNull() final {
|
||||
const auto first_child_code = type_codes_[0];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
|
||||
for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append multiple null values.
|
||||
///
|
||||
/// Nulls are appended to the first child, empty values to the other children.
|
||||
Status AppendNulls(int64_t length) final {
|
||||
const auto first_child_code = type_codes_[0];
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
|
||||
for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
|
||||
ARROW_RETURN_NOT_OK(
|
||||
type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValue() final {
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
|
||||
for (int8_t code : type_codes_) {
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status AppendEmptyValues(int64_t length) final {
|
||||
ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
|
||||
for (int8_t code : type_codes_) {
|
||||
ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append an element to the UnionArray. This must be followed
|
||||
/// by an append to the appropriate child builder.
|
||||
///
|
||||
/// \param[in] next_type type_id of the child to which the next value will be appended.
|
||||
///
|
||||
/// The corresponding child builder must be appended to independently after this method
|
||||
/// is called, and all other child builders must have null or empty value appended.
|
||||
Status Append(int8_t next_type) { return types_builder_.Append(next_type); }
|
||||
|
||||
Status AppendArraySlice(const ArraySpan& array, int64_t offset,
|
||||
int64_t length) override;
|
||||
};
|
||||
|
||||
/// @}
|
||||
|
||||
} // namespace arrow
|
||||
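Editor's note: a minimal usage sketch (not part of the committed file) for the sparse union builder declared above, assuming the standard Arrow C++ API; the child field names "ints" and "strs" are illustrative. Append() records the type code for a slot, the matching child builder is appended to directly, and every other child receives an empty value so all children stay the same length.

#include <arrow/api.h>

arrow::Status BuildSparseUnion(std::shared_ptr<arrow::Array>* out) {
  auto pool = arrow::default_memory_pool();
  auto i32_builder = std::make_shared<arrow::Int32Builder>(pool);
  auto str_builder = std::make_shared<arrow::StringBuilder>(pool);
  arrow::SparseUnionBuilder builder(pool);
  const int8_t i32_code = builder.AppendChild(i32_builder, "ints");  // hypothetical field names
  builder.AppendChild(str_builder, "strs");
  // Slot 0 holds an int32; the string child still needs an empty value.
  ARROW_RETURN_NOT_OK(builder.Append(i32_code));
  ARROW_RETURN_NOT_OK(i32_builder->Append(42));
  ARROW_RETURN_NOT_OK(str_builder->AppendEmptyValue());
  return builder.Finish(out);
}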
@@ -0,0 +1,37 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>

#include "arrow/type_fwd.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \brief Concatenate arrays
///
/// \param[in] arrays a vector of arrays to be concatenated
/// \param[in] pool memory to store the result will be allocated from this memory pool
/// \return the concatenated array
ARROW_EXPORT
Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
                                           MemoryPool* pool = default_memory_pool());

}  // namespace arrow
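Editor's note: a short, hedged example (not part of the committed file) showing the Concatenate declaration above in use; the wrapper function name is illustrative.

#include <arrow/api.h>

arrow::Result<std::shared_ptr<arrow::Array>> JoinChunks(
    const std::shared_ptr<arrow::Array>& a, const std::shared_ptr<arrow::Array>& b) {
  // Both inputs must have the same type; the result is one contiguous array.
  return arrow::Concatenate({a, b}, arrow::default_memory_pool());
}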
@@ -0,0 +1,391 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <atomic>  // IWYU pragma: export
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/type.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

class Array;

// When slicing, we do not know the null count of the sliced range without
// doing some computation. To avoid doing this eagerly, we set the null count
// to -1 (any negative number will do). When Array::null_count is called the
// first time, the null count will be computed. See ARROW-33
constexpr int64_t kUnknownNullCount = -1;

// ----------------------------------------------------------------------
// Generic array data container

/// \class ArrayData
/// \brief Mutable container for generic Arrow array data
///
/// This data structure is a self-contained representation of the memory and
/// metadata inside an Arrow array data structure (called vectors in Java). The
/// classes arrow::Array and its subclasses provide strongly-typed accessors
/// with support for the visitor pattern and other affordances.
///
/// This class is designed for easy internal data manipulation, analytical data
/// processing, and data transport to and from IPC messages. For example, we
/// could cast from int64 to float64 like so:
///
///     Int64Array arr = GetMyData();
///     auto new_data = arr.data()->Copy();
///     new_data->type = arrow::float64();
///     DoubleArray double_arr(new_data);
///
/// This object is also useful in an analytics setting where memory may be
/// reused. For example, if we had a group of operations all returning doubles,
/// say:
///
///     Log(Sqrt(Expr(arr)))
///
/// Then the low-level implementations of each of these functions could have
/// the signatures
///
///     void Log(const ArrayData& values, ArrayData* out);
///
/// As another example a function may consume one or more memory buffers in an
/// input array and replace them with newly-allocated data, changing the output
/// data type as well.
struct ARROW_EXPORT ArrayData {
  ArrayData() = default;

  ArrayData(std::shared_ptr<DataType> type, int64_t length,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}

  ArrayData(std::shared_ptr<DataType> type, int64_t length,
            std::vector<std::shared_ptr<Buffer>> buffers,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : ArrayData(std::move(type), length, null_count, offset) {
    this->buffers = std::move(buffers);
  }

  ArrayData(std::shared_ptr<DataType> type, int64_t length,
            std::vector<std::shared_ptr<Buffer>> buffers,
            std::vector<std::shared_ptr<ArrayData>> child_data,
            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
      : ArrayData(std::move(type), length, null_count, offset) {
    this->buffers = std::move(buffers);
    this->child_data = std::move(child_data);
  }

  static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
                                         std::vector<std::shared_ptr<Buffer>> buffers,
                                         int64_t null_count = kUnknownNullCount,
                                         int64_t offset = 0);

  static std::shared_ptr<ArrayData> Make(
      std::shared_ptr<DataType> type, int64_t length,
      std::vector<std::shared_ptr<Buffer>> buffers,
      std::vector<std::shared_ptr<ArrayData>> child_data,
      int64_t null_count = kUnknownNullCount, int64_t offset = 0);

  static std::shared_ptr<ArrayData> Make(
      std::shared_ptr<DataType> type, int64_t length,
      std::vector<std::shared_ptr<Buffer>> buffers,
      std::vector<std::shared_ptr<ArrayData>> child_data,
      std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
      int64_t offset = 0);

  static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
                                         int64_t null_count = kUnknownNullCount,
                                         int64_t offset = 0);

  // Move constructor
  ArrayData(ArrayData&& other) noexcept
      : type(std::move(other.type)),
        length(other.length),
        offset(other.offset),
        buffers(std::move(other.buffers)),
        child_data(std::move(other.child_data)),
        dictionary(std::move(other.dictionary)) {
    SetNullCount(other.null_count);
  }

  // Copy constructor
  ArrayData(const ArrayData& other) noexcept
      : type(other.type),
        length(other.length),
        offset(other.offset),
        buffers(other.buffers),
        child_data(other.child_data),
        dictionary(other.dictionary) {
    SetNullCount(other.null_count);
  }

  // Move assignment
  ArrayData& operator=(ArrayData&& other) {
    type = std::move(other.type);
    length = other.length;
    SetNullCount(other.null_count);
    offset = other.offset;
    buffers = std::move(other.buffers);
    child_data = std::move(other.child_data);
    dictionary = std::move(other.dictionary);
    return *this;
  }

  // Copy assignment
  ArrayData& operator=(const ArrayData& other) {
    type = other.type;
    length = other.length;
    SetNullCount(other.null_count);
    offset = other.offset;
    buffers = other.buffers;
    child_data = other.child_data;
    dictionary = other.dictionary;
    return *this;
  }

  std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }

  bool IsNull(int64_t i) const {
    return ((buffers[0] != NULLPTR) ? !bit_util::GetBit(buffers[0]->data(), i + offset)
                                    : null_count.load() == length);
  }

  // Access a buffer's data as a typed C pointer
  template <typename T>
  inline const T* GetValues(int i, int64_t absolute_offset) const {
    if (buffers[i]) {
      return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
    } else {
      return NULLPTR;
    }
  }

  template <typename T>
  inline const T* GetValues(int i) const {
    return GetValues<T>(i, offset);
  }

  // Like GetValues, but returns NULLPTR instead of aborting if the underlying
  // buffer is not a CPU buffer.
  template <typename T>
  inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
    if (buffers[i] && buffers[i]->is_cpu()) {
      return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
    } else {
      return NULLPTR;
    }
  }

  template <typename T>
  inline const T* GetValuesSafe(int i) const {
    return GetValuesSafe<T>(i, offset);
  }

  // Access a buffer's data as a typed C pointer
  template <typename T>
  inline T* GetMutableValues(int i, int64_t absolute_offset) {
    if (buffers[i]) {
      return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
    } else {
      return NULLPTR;
    }
  }

  template <typename T>
  inline T* GetMutableValues(int i) {
    return GetMutableValues<T>(i, offset);
  }

  /// \brief Construct a zero-copy slice of the data with the given offset and length
  std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;

  /// \brief Input-checking variant of Slice
  ///
  /// An Invalid Status is returned if the requested slice falls out of bounds.
  /// Note that unlike Slice, `length` isn't clamped to the available buffer size.
  Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;

  void SetNullCount(int64_t v) { null_count.store(v); }

  /// \brief Return null count, or compute and set it if it's not known
  int64_t GetNullCount() const;

  bool MayHaveNulls() const {
    // If an ArrayData is slightly malformed it may have kUnknownNullCount set
    // but no buffer
    return null_count.load() != 0 && buffers[0] != NULLPTR;
  }

  std::shared_ptr<DataType> type;
  int64_t length = 0;
  mutable std::atomic<int64_t> null_count{0};
  // The logical start point into the physical buffers (in values, not bytes).
  // Note that, for child data, this must be *added* to the child data's own offset.
  int64_t offset = 0;
  std::vector<std::shared_ptr<Buffer>> buffers;
  std::vector<std::shared_ptr<ArrayData>> child_data;

  // The dictionary for this Array, if any. Only used for dictionary type
  std::shared_ptr<ArrayData> dictionary;
};

/// \brief A non-owning Buffer reference
struct ARROW_EXPORT BufferSpan {
  // It is the user of this class's responsibility to ensure that
  // buffers that were const originally are not written to
  // accidentally.
  uint8_t* data = NULLPTR;
  int64_t size = 0;
  // Pointer back to buffer that owns this memory
  const std::shared_ptr<Buffer>* owner = NULLPTR;
};

/// \brief EXPERIMENTAL: A non-owning ArrayData reference that is cheaply
/// copyable and does not contain any shared_ptr objects. Do not use in public
/// APIs aside from compute kernels for now
struct ARROW_EXPORT ArraySpan {
  const DataType* type = NULLPTR;
  int64_t length = 0;
  mutable int64_t null_count = kUnknownNullCount;
  int64_t offset = 0;
  BufferSpan buffers[3];

  // 16 bytes of scratch space to enable this ArraySpan to be a view onto
  // scalar values including binary scalars (where we need to create a buffer
  // that looks like two 32-bit or 64-bit offsets)
  uint64_t scratch_space[2];

  ArraySpan() = default;

  explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {}

  ArraySpan(const ArrayData& data) {  // NOLINT implicit conversion
    SetMembers(data);
  }
  explicit ArraySpan(const Scalar& data) { FillFromScalar(data); }

  /// If dictionary-encoded, put dictionary in the first entry
  std::vector<ArraySpan> child_data;

  /// \brief Populate ArraySpan to look like an array of length 1 pointing at
  /// the data members of a Scalar value
  void FillFromScalar(const Scalar& value);

  void SetMembers(const ArrayData& data);

  void SetBuffer(int index, const std::shared_ptr<Buffer>& buffer) {
    this->buffers[index].data = const_cast<uint8_t*>(buffer->data());
    this->buffers[index].size = buffer->size();
    this->buffers[index].owner = &buffer;
  }

  const ArraySpan& dictionary() const { return child_data[0]; }

  /// \brief Return the number of buffers (out of 3) that are used to
  /// constitute this array
  int num_buffers() const;

  // Access a buffer's data as a typed C pointer
  template <typename T>
  inline T* GetValues(int i, int64_t absolute_offset) {
    return reinterpret_cast<T*>(buffers[i].data) + absolute_offset;
  }

  template <typename T>
  inline T* GetValues(int i) {
    return GetValues<T>(i, this->offset);
  }

  // Access a buffer's data as a typed C pointer
  template <typename T>
  inline const T* GetValues(int i, int64_t absolute_offset) const {
    return reinterpret_cast<const T*>(buffers[i].data) + absolute_offset;
  }

  template <typename T>
  inline const T* GetValues(int i) const {
    return GetValues<T>(i, this->offset);
  }

  inline bool IsValid(int64_t i) const {
    return ((this->buffers[0].data != NULLPTR)
                ? bit_util::GetBit(this->buffers[0].data, i + this->offset)
                : this->null_count != this->length);
  }

  inline bool IsNull(int64_t i) const { return !IsValid(i); }

  std::shared_ptr<ArrayData> ToArrayData() const;

  std::shared_ptr<Array> ToArray() const;

  std::shared_ptr<Buffer> GetBuffer(int index) const {
    const BufferSpan& buf = this->buffers[index];
    if (buf.owner) {
      return *buf.owner;
    } else if (buf.data != NULLPTR) {
      // Buffer points to some memory without an owning buffer
      return std::make_shared<Buffer>(buf.data, buf.size);
    } else {
      return NULLPTR;
    }
  }

  void SetSlice(int64_t offset, int64_t length) {
    this->offset = offset;
    this->length = length;
    if (this->type->id() != Type::NA) {
      this->null_count = kUnknownNullCount;
    } else {
      this->null_count = this->length;
    }
  }

  /// \brief Return null count, or compute and set it if it's not known
  int64_t GetNullCount() const;

  bool MayHaveNulls() const {
    // If an ArrayData is slightly malformed it may have kUnknownNullCount set
    // but no buffer
    return null_count != 0 && buffers[0].data != NULLPTR;
  }
};

namespace internal {

void FillZeroLengthArray(const DataType* type, ArraySpan* span);

/// Construct a zero-copy view of this ArrayData with the given type.
///
/// This method checks if the types are layout-compatible.
/// Nested types are traversed in depth-first order. Data buffers must have
/// the same item sizes, even though the logical types may be different.
/// An error is returned if the types are not layout-compatible.
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
                                                const std::shared_ptr<DataType>& type);

}  // namespace internal
}  // namespace arrow
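Editor's note: a sketch (not part of the committed file) of the zero-copy reinterpretation pattern described in the ArrayData doc comment above, assuming int64 and float64 share a fixed-width layout.

#include <arrow/api.h>

std::shared_ptr<arrow::DoubleArray> ViewAsDouble(const arrow::Int64Array& arr) {
  auto new_data = arr.data()->Copy();  // shallow copy: buffers are shared, not duplicated
  new_data->type = arrow::float64();   // only the logical type changes
  return std::make_shared<arrow::DoubleArray>(new_data);
}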
@@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <functional>
#include <iosfwd>
#include <memory>

#include "arrow/array/array_base.h"
#include "arrow/array/array_nested.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \brief Compare two arrays, returning an edit script which expresses the difference
/// between them
///
/// An edit script is an array of struct(insert: bool, run_length: int64_t).
/// Each element of "insert" determines whether an element was inserted into (true)
/// or deleted from (false) base. Each insertion or deletion is followed by a run of
/// elements which are unchanged from base to target; the length of this run is stored
/// in "run_length". (Note that the edit script begins and ends with a run of shared
/// elements but both fields of the struct must have the same length. To accommodate this
/// the first element of "insert" should be ignored.)
///
/// For example for base "hlloo" and target "hello", the edit script would be
/// [
///   {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
///   {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
///   {"insert": false, "run_length": 0} // delete("o") then an empty run
/// ]
///
/// Diffing arrays containing nulls is not currently supported.
///
/// \param[in] base baseline for comparison
/// \param[in] target an array of identical type to base whose elements differ from base's
/// \param[in] pool memory to store the result will be allocated from this memory pool
/// \return an edit script array which can be applied to base to produce target
ARROW_EXPORT
Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
                                          MemoryPool* pool = default_memory_pool());

/// \brief visitor interface for easy traversal of an edit script
///
/// visitor will be called for each hunk of insertions and deletions.
ARROW_EXPORT Status VisitEditScript(
    const Array& edits,
    const std::function<Status(int64_t delete_begin, int64_t delete_end,
                               int64_t insert_begin, int64_t insert_end)>& visitor);

/// \brief return a function which will format an edit script in unified
/// diff format to os, given base and target arrays of type
ARROW_EXPORT Result<
    std::function<Status(const Array& edits, const Array& base, const Array& target)>>
MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);

}  // namespace arrow
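Editor's note: a hedged sketch (not part of the committed file) of driving Diff() and VisitEditScript() from the declarations above; the hunk-printing callback is illustrative only.

#include <cstdio>
#include <arrow/api.h>
#include <arrow/array/diff.h>

arrow::Status PrintHunks(const arrow::Array& base, const arrow::Array& target) {
  // Diff returns the edit script as a StructArray; VisitEditScript walks its hunks.
  ARROW_ASSIGN_OR_RAISE(auto edits, arrow::Diff(base, target));
  return arrow::VisitEditScript(
      *edits, [](int64_t delete_begin, int64_t delete_end, int64_t insert_begin,
                 int64_t insert_end) {
        std::printf("delete [%lld, %lld) insert [%lld, %lld)\n",
                    static_cast<long long>(delete_begin),
                    static_cast<long long>(delete_end),
                    static_cast<long long>(insert_begin),
                    static_cast<long long>(insert_end));
        return arrow::Status::OK();
      });
}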
@@ -0,0 +1,89 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <vector>

#include "arrow/array/data.h"
#include "arrow/compare.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \brief Create a strongly-typed Array instance from generic ArrayData
/// \param[in] data the array contents
/// \return the resulting Array instance
ARROW_EXPORT
std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);

/// \brief Create a strongly-typed Array instance with all elements null
/// \param[in] type the array type
/// \param[in] length the array length
/// \param[in] pool the memory pool to allocate memory from
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
                                               int64_t length,
                                               MemoryPool* pool = default_memory_pool());

/// \brief Create an Array instance whose slots are the given scalar
/// \param[in] scalar the value with which to fill the array
/// \param[in] length the array length
/// \param[in] pool the memory pool to allocate memory from
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeArrayFromScalar(
    const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());

/// \brief Create an empty Array of a given type
///
/// The output Array will be of the given type.
///
/// \param[in] type the data type of the empty Array
/// \param[in] pool the memory pool to allocate memory from
/// \return the resulting Array
ARROW_EXPORT
Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
                                              MemoryPool* pool = default_memory_pool());

namespace internal {

/// \brief Swap endian of each element in a generic ArrayData
///
/// As dictionaries are often shared between different arrays, dictionaries
/// are not swapped by this function and should be handled separately.
///
/// \param[in] data the array contents
/// \return the resulting ArrayData whose elements were swapped
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
    const std::shared_ptr<ArrayData>& data);

/// Given a number of ArrayVectors, treat each ArrayVector as the
/// chunks of a chunked array. Then rechunk each ArrayVector such that
/// all ArrayVectors are chunked identically. It is mandatory that
/// all ArrayVectors contain the same total number of elements.
ARROW_EXPORT
std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);

}  // namespace internal
}  // namespace arrow
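Editor's note: a minimal sketch (not part of the committed file) of the factory helpers declared above.

#include <arrow/api.h>

arrow::Status MakeExamples() {
  // Eight null slots of int32.
  ARROW_ASSIGN_OR_RAISE(auto nulls, arrow::MakeArrayOfNull(arrow::int32(), 8));
  // Eight slots, each holding the value 7.
  arrow::Int32Scalar seven(7);
  ARROW_ASSIGN_OR_RAISE(auto sevens, arrow::MakeArrayFromScalar(seven, 8));
  return arrow::Status::OK();
}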
@@ -0,0 +1,56 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/visibility.h"

namespace arrow {
namespace internal {

// Internal functions implementing Array::Validate() and friends.

// O(1) array metadata validation

ARROW_EXPORT
Status ValidateArray(const Array& array);

ARROW_EXPORT
Status ValidateArray(const ArrayData& data);

// O(N) array data validation.
// Note that, starting from 7.0.0, "full" routines also validate metadata.
// Before, ValidateArray() needed to be called before ValidateArrayFull()
// to ensure metadata correctness, otherwise invalid memory accesses
// may occur.

ARROW_EXPORT
Status ValidateArrayFull(const Array& array);

ARROW_EXPORT
Status ValidateArrayFull(const ArrayData& data);

ARROW_EXPORT
Status ValidateUTF8(const Array& array);

ARROW_EXPORT
Status ValidateUTF8(const ArrayData& data);

}  // namespace internal
}  // namespace arrow
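Editor's note: the routines above are internal; the public entry points they implement are Array::Validate() and Array::ValidateFull(), sketched here (not part of the committed file).

#include <arrow/api.h>

arrow::Status CheckArray(const arrow::Array& array) {
  ARROW_RETURN_NOT_OK(array.Validate());  // O(1) metadata checks
  return array.ValidateFull();            // O(N) data checks (also validates metadata)
}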
510
venv/lib/python3.9/site-packages/pyarrow/include/arrow/buffer.h
Normal file
@@ -0,0 +1,510 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "arrow/device.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/bytes_view.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"

namespace arrow {

// ----------------------------------------------------------------------
// Buffer classes

/// \class Buffer
/// \brief Object containing a pointer to a piece of contiguous memory with a
/// particular size.
///
/// Buffers have two related notions of length: size and capacity. Size is
/// the number of bytes that might have valid data. Capacity is the number
/// of bytes that were allocated for the buffer in total.
///
/// The Buffer base class does not own its memory, but subclasses often do.
///
/// The following invariant is always true: Size <= Capacity
class ARROW_EXPORT Buffer {
 public:
  /// \brief Construct from buffer and size without copying memory
  ///
  /// \param[in] data a memory buffer
  /// \param[in] size buffer size
  ///
  /// \note The passed memory must be kept alive through some other means
  Buffer(const uint8_t* data, int64_t size)
      : is_mutable_(false), is_cpu_(true), data_(data), size_(size), capacity_(size) {
    SetMemoryManager(default_cpu_memory_manager());
  }

  Buffer(const uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm,
         std::shared_ptr<Buffer> parent = NULLPTR)
      : is_mutable_(false), data_(data), size_(size), capacity_(size), parent_(parent) {
    SetMemoryManager(std::move(mm));
  }

  Buffer(uintptr_t address, int64_t size, std::shared_ptr<MemoryManager> mm,
         std::shared_ptr<Buffer> parent = NULLPTR)
      : Buffer(reinterpret_cast<const uint8_t*>(address), size, std::move(mm),
               std::move(parent)) {}

  /// \brief Construct from string_view without copying memory
  ///
  /// \param[in] data a string_view object
  ///
  /// \note The memory viewed by data must not be deallocated in the lifetime of the
  /// Buffer; temporary rvalue strings must be stored in an lvalue somewhere
  explicit Buffer(std::string_view data)
      : Buffer(reinterpret_cast<const uint8_t*>(data.data()),
               static_cast<int64_t>(data.size())) {}

  virtual ~Buffer() = default;

  /// An offset into data that is owned by another buffer, but we want to be
  /// able to retain a valid pointer to it even after other shared_ptr's to the
  /// parent buffer have been destroyed
  ///
  /// This method makes no assertions about alignment or padding of the buffer but
  /// in general we expect buffers to be aligned and padded to 64 bytes. In the future
  /// we might add utility methods to help determine if a buffer satisfies this contract.
  Buffer(const std::shared_ptr<Buffer>& parent, const int64_t offset, const int64_t size)
      : Buffer(parent->data_ + offset, size) {
    parent_ = parent;
    SetMemoryManager(parent->memory_manager_);
  }

  uint8_t operator[](std::size_t i) const { return data_[i]; }

  /// \brief Construct a new std::string with a hexadecimal representation of the buffer.
  /// \return std::string
  std::string ToHexString();

  /// Return true if both buffers are the same size and contain the same bytes
  /// up to the number of compared bytes
  bool Equals(const Buffer& other, int64_t nbytes) const;

  /// Return true if both buffers are the same size and contain the same bytes
  bool Equals(const Buffer& other) const;

  /// Copy a section of the buffer into a new Buffer.
  Result<std::shared_ptr<Buffer>> CopySlice(
      const int64_t start, const int64_t nbytes,
      MemoryPool* pool = default_memory_pool()) const;

  /// Zero bytes in padding, i.e. bytes between size_ and capacity_.
  void ZeroPadding() {
#ifndef NDEBUG
    CheckMutable();
#endif
    // A zero-capacity buffer can have a null data pointer
    if (capacity_ != 0) {
      memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
    }
  }

  /// \brief Construct an immutable buffer that takes ownership of the contents
  /// of an std::string (without copying it).
  ///
  /// \param[in] data a string to own
  /// \return a new Buffer instance
  static std::shared_ptr<Buffer> FromString(std::string data);

  /// \brief Create buffer referencing typed memory with some length without
  /// copying
  /// \param[in] data the typed memory as C array
  /// \param[in] length the number of values in the array
  /// \return a new shared_ptr<Buffer>
  template <typename T, typename SizeType = int64_t>
  static std::shared_ptr<Buffer> Wrap(const T* data, SizeType length) {
    return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data),
                                    static_cast<int64_t>(sizeof(T) * length));
  }

  /// \brief Create buffer referencing std::vector with some length without
  /// copying
  /// \param[in] data the vector to be referenced. If this vector is changed,
  /// the buffer may become invalid
  /// \return a new shared_ptr<Buffer>
  template <typename T>
  static std::shared_ptr<Buffer> Wrap(const std::vector<T>& data) {
    return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data.data()),
                                    static_cast<int64_t>(sizeof(T) * data.size()));
  }

  /// \brief Copy buffer contents into a new std::string
  /// \return std::string
  /// \note Can throw std::bad_alloc if buffer is large
  std::string ToString() const;

  /// \brief View buffer contents as a std::string_view
  /// \return std::string_view
  explicit operator std::string_view() const {
    return std::string_view(reinterpret_cast<const char*>(data_), size_);
  }

  /// \brief View buffer contents as a util::bytes_view
  /// \return util::bytes_view
  explicit operator util::bytes_view() const { return util::bytes_view(data_, size_); }

  /// \brief Return a pointer to the buffer's data
  ///
  /// The buffer has to be a CPU buffer (`is_cpu()` is true).
  /// Otherwise, an assertion may be thrown or a null pointer may be returned.
  ///
  /// To get the buffer's data address regardless of its device, call `address()`.
  const uint8_t* data() const {
#ifndef NDEBUG
    CheckCPU();
#endif
    return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR;
  }

  /// \brief Return a writable pointer to the buffer's data
  ///
  /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
  /// are true). Otherwise, an assertion may be thrown or a null pointer may
  /// be returned.
  ///
  /// To get the buffer's mutable data address regardless of its device, call
  /// `mutable_address()`.
  uint8_t* mutable_data() {
#ifndef NDEBUG
    CheckCPU();
    CheckMutable();
#endif
    return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
                                                      : NULLPTR;
  }

  /// \brief Return the device address of the buffer's data
  uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }

  /// \brief Return a writable device address to the buffer's data
  ///
  /// The buffer has to be a mutable buffer (`is_mutable()` is true).
  /// Otherwise, an assertion may be thrown or 0 may be returned.
  uintptr_t mutable_address() const {
#ifndef NDEBUG
    CheckMutable();
#endif
    return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast<uintptr_t>(data_) : 0;
  }

  /// \brief Return the buffer's size in bytes
  int64_t size() const { return size_; }

  /// \brief Return the buffer's capacity (number of allocated bytes)
  int64_t capacity() const { return capacity_; }

  /// \brief Whether the buffer is directly CPU-accessible
  ///
  /// If this function returns true, you can read directly from the buffer's
  /// `data()` pointer. Otherwise, you'll have to `View()` or `Copy()` it.
  bool is_cpu() const { return is_cpu_; }

  /// \brief Whether the buffer is mutable
  ///
  /// If this function returns true, you are allowed to modify buffer contents
  /// using the pointer returned by `mutable_data()` or `mutable_address()`.
  bool is_mutable() const { return is_mutable_; }

  const std::shared_ptr<Device>& device() const { return memory_manager_->device(); }

  const std::shared_ptr<MemoryManager>& memory_manager() const { return memory_manager_; }

  std::shared_ptr<Buffer> parent() const { return parent_; }

  /// \brief Get a RandomAccessFile for reading a buffer
  ///
  /// The returned file object reads from this buffer's underlying memory.
  static Result<std::shared_ptr<io::RandomAccessFile>> GetReader(std::shared_ptr<Buffer>);

  /// \brief Get an OutputStream for writing to a buffer
  ///
  /// The buffer must be mutable. The returned stream object writes into the buffer's
  /// underlying memory (but it won't resize it).
  static Result<std::shared_ptr<io::OutputStream>> GetWriter(std::shared_ptr<Buffer>);

  /// \brief Copy buffer
  ///
  /// The buffer contents will be copied into a new buffer allocated by the
  /// given MemoryManager. This function supports cross-device copies.
  static Result<std::shared_ptr<Buffer>> Copy(std::shared_ptr<Buffer> source,
                                              const std::shared_ptr<MemoryManager>& to);

  /// \brief Copy a non-owned buffer
  ///
  /// This is useful for cases where the source memory area is externally managed
  /// (its lifetime not tied to the source Buffer), otherwise please use Copy().
  static Result<std::unique_ptr<Buffer>> CopyNonOwned(
      const Buffer& source, const std::shared_ptr<MemoryManager>& to);

  /// \brief View buffer
  ///
  /// Return a Buffer that reflects this buffer, seen potentially from another
  /// device, without making an explicit copy of the contents. The underlying
  /// mechanism is typically implemented by the kernel or device driver, and may
  /// involve lazy caching of parts of the buffer contents on the destination
  /// device's memory.
  ///
  /// If a non-copy view is unsupported for the buffer on the given device,
  /// nullptr is returned. An error can be returned if some low-level
  /// operation fails (such as an out-of-memory condition).
  static Result<std::shared_ptr<Buffer>> View(std::shared_ptr<Buffer> source,
                                              const std::shared_ptr<MemoryManager>& to);

  /// \brief View or copy buffer
  ///
  /// Try to view buffer contents on the given MemoryManager's device, but
  /// fall back to copying if a no-copy view isn't supported.
  static Result<std::shared_ptr<Buffer>> ViewOrCopy(
      std::shared_ptr<Buffer> source, const std::shared_ptr<MemoryManager>& to);

 protected:
  bool is_mutable_;
  bool is_cpu_;
  const uint8_t* data_;
  int64_t size_;
  int64_t capacity_;

  // null by default, but may be set
  std::shared_ptr<Buffer> parent_;

 private:
  // private so that subclasses are forced to call SetMemoryManager()
  std::shared_ptr<MemoryManager> memory_manager_;

 protected:
  void CheckMutable() const;
  void CheckCPU() const;

  void SetMemoryManager(std::shared_ptr<MemoryManager> mm) {
    memory_manager_ = std::move(mm);
    is_cpu_ = memory_manager_->is_cpu();
  }

 private:
  Buffer() = delete;
  ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer);
};

/// \defgroup buffer-slicing-functions Functions for slicing buffers
///
/// @{

/// \brief Construct a view on a buffer at the given offset and length.
///
/// This function cannot fail and does not check for errors (except in debug builds)
static inline std::shared_ptr<Buffer> SliceBuffer(const std::shared_ptr<Buffer>& buffer,
                                                  const int64_t offset,
                                                  const int64_t length) {
  return std::make_shared<Buffer>(buffer, offset, length);
}

/// \brief Construct a view on a buffer at the given offset, up to the buffer's end.
///
/// This function cannot fail and does not check for errors (except in debug builds)
static inline std::shared_ptr<Buffer> SliceBuffer(const std::shared_ptr<Buffer>& buffer,
                                                  const int64_t offset) {
  int64_t length = buffer->size() - offset;
  return SliceBuffer(buffer, offset, length);
}

/// \brief Input-checking version of SliceBuffer
///
/// An Invalid Status is returned if the requested slice falls out of bounds.
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SliceBufferSafe(const std::shared_ptr<Buffer>& buffer,
                                                int64_t offset);
/// \brief Input-checking version of SliceBuffer
///
/// An Invalid Status is returned if the requested slice falls out of bounds.
/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SliceBufferSafe(const std::shared_ptr<Buffer>& buffer,
                                                int64_t offset, int64_t length);

/// \brief Like SliceBuffer, but construct a mutable buffer slice.
///
/// If the parent buffer is not mutable, behavior is undefined (it may abort
/// in debug builds).
ARROW_EXPORT
std::shared_ptr<Buffer> SliceMutableBuffer(const std::shared_ptr<Buffer>& buffer,
                                           const int64_t offset, const int64_t length);

/// \brief Like SliceBuffer, but construct a mutable buffer slice.
///
/// If the parent buffer is not mutable, behavior is undefined (it may abort
/// in debug builds).
static inline std::shared_ptr<Buffer> SliceMutableBuffer(
    const std::shared_ptr<Buffer>& buffer, const int64_t offset) {
  int64_t length = buffer->size() - offset;
  return SliceMutableBuffer(buffer, offset, length);
}

/// \brief Input-checking version of SliceMutableBuffer
///
/// An Invalid Status is returned if the requested slice falls out of bounds.
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(
    const std::shared_ptr<Buffer>& buffer, int64_t offset);
/// \brief Input-checking version of SliceMutableBuffer
///
/// An Invalid Status is returned if the requested slice falls out of bounds.
/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(
    const std::shared_ptr<Buffer>& buffer, int64_t offset, int64_t length);

/// @}

/// \class MutableBuffer
/// \brief A Buffer whose contents can be mutated. May or may not own its data.
class ARROW_EXPORT MutableBuffer : public Buffer {
 public:
  MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) {
    is_mutable_ = true;
  }

  MutableBuffer(uint8_t* data, const int64_t size, std::shared_ptr<MemoryManager> mm)
      : Buffer(data, size, std::move(mm)) {
    is_mutable_ = true;
  }

  MutableBuffer(const std::shared_ptr<Buffer>& parent, const int64_t offset,
                const int64_t size);

  /// \brief Create buffer referencing typed memory with some length
  /// \param[in] data the typed memory as C array
  /// \param[in] length the number of values in the array
  /// \return a new shared_ptr<Buffer>
  template <typename T, typename SizeType = int64_t>
  static std::shared_ptr<Buffer> Wrap(T* data, SizeType length) {
    return std::make_shared<MutableBuffer>(reinterpret_cast<uint8_t*>(data),
                                           static_cast<int64_t>(sizeof(T) * length));
  }

 protected:
  MutableBuffer() : Buffer(NULLPTR, 0) {}
};

/// \class ResizableBuffer
/// \brief A mutable buffer that can be resized
class ARROW_EXPORT ResizableBuffer : public MutableBuffer {
 public:
  /// Change buffer reported size to indicated size, allocating memory if
  /// necessary. This will ensure that the capacity of the buffer is a multiple
  /// of 64 bytes as defined in Layout.md.
  /// Consider using ZeroPadding afterwards, to conform to the Arrow layout
  /// specification.
  ///
  /// @param new_size The new size for the buffer.
  /// @param shrink_to_fit Whether to shrink the capacity if new size < current size
  virtual Status Resize(const int64_t new_size, bool shrink_to_fit) = 0;
  Status Resize(const int64_t new_size) {
    return Resize(new_size, /*shrink_to_fit=*/true);
  }

  /// Ensure that buffer has enough memory allocated to fit the indicated
  /// capacity (and meets the 64 byte padding requirement in Layout.md).
  /// It does not change buffer's reported size and doesn't zero the padding.
  virtual Status Reserve(const int64_t new_capacity) = 0;

  template <class T>
  Status TypedResize(const int64_t new_nb_elements, bool shrink_to_fit = true) {
    return Resize(sizeof(T) * new_nb_elements, shrink_to_fit);
  }

  template <class T>
  Status TypedReserve(const int64_t new_nb_elements) {
    return Reserve(sizeof(T) * new_nb_elements);
  }

 protected:
  ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {}
  ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm)
      : MutableBuffer(data, size, std::move(mm)) {}
};

/// \defgroup buffer-allocation-functions Functions for allocating buffers
///
/// @{

/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding.
///
/// \param[in] size size of buffer to allocate
/// \param[in] pool a memory pool
ARROW_EXPORT
Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size,
                                               MemoryPool* pool = NULLPTR);
ARROW_EXPORT
Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size, int64_t alignment,
                                               MemoryPool* pool = NULLPTR);

/// \brief Allocate a resizeable buffer from a memory pool, zero its padding.
///
/// \param[in] size size of buffer to allocate
/// \param[in] pool a memory pool
ARROW_EXPORT
Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(
    const int64_t size, MemoryPool* pool = NULLPTR);
ARROW_EXPORT
Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(
    const int64_t size, const int64_t alignment, MemoryPool* pool = NULLPTR);

/// \brief Allocate a bitmap buffer from a memory pool
/// no guarantee on values is provided.
///
/// \param[in] length size in bits of bitmap to allocate
/// \param[in] pool memory pool to allocate memory from
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> AllocateBitmap(int64_t length,
                                               MemoryPool* pool = NULLPTR);

/// \brief Allocate a zero-initialized bitmap buffer from a memory pool
///
/// \param[in] length size in bits of bitmap to allocate
/// \param[in] pool memory pool to allocate memory from
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> AllocateEmptyBitmap(int64_t length,
                                                    MemoryPool* pool = NULLPTR);

ARROW_EXPORT
Result<std::shared_ptr<Buffer>> AllocateEmptyBitmap(int64_t length, int64_t alignment,
                                                    MemoryPool* pool = NULLPTR);

/// \brief Concatenate multiple buffers into a single buffer
///
/// \param[in] buffers to be concatenated
/// \param[in] pool memory pool to allocate the new buffer from
ARROW_EXPORT
Result<std::shared_ptr<Buffer>> ConcatenateBuffers(const BufferVector& buffers,
                                                   MemoryPool* pool = NULLPTR);

/// @}

}  // namespace arrow
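Editor's note: a hedged sketch (not part of the committed file) of non-owning wrapping and zero-copy slicing with the Buffer APIs declared above; the wrapped vector must outlive the returned buffer.

#include <memory>
#include <vector>
#include <arrow/buffer.h>

std::shared_ptr<arrow::Buffer> MiddleFourBytes(const std::vector<int32_t>& values) {
  // Wrap views the vector's memory without copying (sizeof(int32_t) * size() bytes).
  std::shared_ptr<arrow::Buffer> buf = arrow::Buffer::Wrap(values);
  // SliceBuffer is zero-copy; the slice keeps the parent buffer alive.
  return arrow::SliceBuffer(buf, /*offset=*/4, /*length=*/4);
}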
@@ -0,0 +1,467 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "arrow/buffer.h"
|
||||
#include "arrow/status.h"
|
||||
#include "arrow/util/bit_util.h"
|
||||
#include "arrow/util/bitmap_generate.h"
|
||||
#include "arrow/util/bitmap_ops.h"
|
||||
#include "arrow/util/macros.h"
|
||||
#include "arrow/util/ubsan.h"
|
||||
#include "arrow/util/visibility.h"
|
||||
|
||||
namespace arrow {
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
// Buffer builder classes
|
||||
|
||||
/// \class BufferBuilder
|
||||
/// \brief A class for incrementally building a contiguous chunk of in-memory
|
||||
/// data
|
||||
class ARROW_EXPORT BufferBuilder {
|
||||
public:
|
||||
explicit BufferBuilder(MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: pool_(pool),
|
||||
data_(/*ensure never null to make ubsan happy and avoid check penalties below*/
|
||||
util::MakeNonNull<uint8_t>()),
|
||||
capacity_(0),
|
||||
size_(0),
|
||||
alignment_(alignment) {}
|
||||
|
||||
/// \brief Constructs new Builder that will start using
|
||||
/// the provided buffer until Finish/Reset are called.
|
||||
/// The buffer is not resized.
|
||||
explicit BufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
|
||||
MemoryPool* pool = default_memory_pool(),
|
||||
int64_t alignment = kDefaultBufferAlignment)
|
||||
: buffer_(std::move(buffer)),
|
||||
pool_(pool),
|
||||
data_(buffer_->mutable_data()),
|
||||
capacity_(buffer_->capacity()),
|
||||
size_(buffer_->size()),
|
||||
alignment_(alignment) {}
|
||||
|
||||
/// \brief Resize the buffer to the nearest multiple of 64 bytes
|
||||
///
|
||||
/// \param new_capacity the new capacity of the of the builder. Will be
|
||||
/// rounded up to a multiple of 64 bytes for padding
|
||||
/// \param shrink_to_fit if new capacity is smaller than the existing,
|
||||
/// reallocate internal buffer. Set to false to avoid reallocations when
|
||||
/// shrinking the builder.
|
||||
/// \return Status
|
||||
Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
|
||||
if (buffer_ == NULLPTR) {
|
||||
ARROW_ASSIGN_OR_RAISE(buffer_,
|
||||
AllocateResizableBuffer(new_capacity, alignment_, pool_));
|
||||
} else {
|
||||
ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit));
|
||||
}
|
||||
capacity_ = buffer_->capacity();
|
||||
data_ = buffer_->mutable_data();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Ensure that builder can accommodate the additional number of bytes
|
||||
/// without the need to perform allocations
|
||||
///
|
||||
/// \param[in] additional_bytes number of additional bytes to make space for
|
||||
/// \return Status
|
||||
Status Reserve(const int64_t additional_bytes) {
|
||||
auto min_capacity = size_ + additional_bytes;
|
||||
if (min_capacity <= capacity_) {
|
||||
return Status::OK();
|
||||
}
|
||||
return Resize(GrowByFactor(capacity_, min_capacity), false);
|
||||
}
|
||||
|
||||
/// \brief Return a capacity expanded by the desired growth factor
|
||||
static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) {
|
||||
// Doubling capacity except for large Reserve requests. 2x growth strategy
|
||||
// (versus 1.5x) seems to have slightly better performance when using
|
||||
// jemalloc, but significantly better performance when using the system
|
||||
// allocator. See ARROW-6450 for further discussion
|
||||
return std::max(new_capacity, current_capacity * 2);
|
||||
}
|
||||
|
||||
/// \brief Append the given data to the buffer
|
||||
///
|
||||
/// The buffer is automatically expanded if necessary.
|
||||
Status Append(const void* data, const int64_t length) {
|
||||
if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) {
|
||||
ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, size_ + length), false));
|
||||
}
|
||||
UnsafeAppend(data, length);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
/// \brief Append copies of a value to the buffer
|
||||
///
|
||||
/// The buffer is automatically expanded if necessary.
|
||||
Status Append(const int64_t num_copies, uint8_t value) {
|
||||
ARROW_RETURN_NOT_OK(Reserve(num_copies));
|
||||
UnsafeAppend(num_copies, value);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Advance pointer and zero out memory
|
||||
Status Advance(const int64_t length) { return Append(length, 0); }
|
||||
|
||||
// Advance pointer, but don't allocate or zero memory
|
||||
void UnsafeAdvance(const int64_t length) { size_ += length; }
|
||||
|
||||
// Unsafe methods don't check existing size
|
||||
void UnsafeAppend(const void* data, const int64_t length) {
|
||||
memcpy(data_ + size_, data, static_cast<size_t>(length));
|
||||
size_ += length;
|
||||
}
|
||||
|
||||
void UnsafeAppend(const int64_t num_copies, uint8_t value) {
|
||||
memset(data_ + size_, value, static_cast<size_t>(num_copies));
|
||||
size_ += num_copies;
|
||||
}
|
||||
|
||||
/// \brief Return result of builder as a Buffer object.
|
||||
///
|
||||
/// The builder is reset and can be reused afterwards.
|
||||
///
|
||||
/// \param[out] out the finalized Buffer object
|
||||
/// \param shrink_to_fit if the buffer size is smaller than its capacity,
|
||||
/// reallocate to fit more tightly in memory. Set to false to avoid
|
||||
/// a reallocation, at the expense of potentially more memory consumption.
|
||||
/// \return Status
|
||||
Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
|
||||
ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
|
||||
if (size_ != 0) buffer_->ZeroPadding();
|
||||
*out = buffer_;
|
||||
if (*out == NULLPTR) {
|
||||
ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, alignment_, pool_));
|
||||
}
|
||||
Reset();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
|
||||
std::shared_ptr<Buffer> out;
|
||||
ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
|
||||
return out;
|
||||
}
  /// \brief Like Finish, but override the final buffer size
  ///
  /// This is useful after writing data directly into the builder memory
  /// without calling the Append methods (basically, when using BufferBuilder
  /// mostly for memory allocation).
  Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
                                                   bool shrink_to_fit = true) {
    size_ = final_length;
    return Finish(shrink_to_fit);
  }
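The direct-write workflow that FinishWithLength is meant for might look like the following sketch (BuildDirect is a hypothetical helper; it assumes Resize and mutable_data behave as declared above):

#include <arrow/buffer.h>
#include <arrow/buffer_builder.h>
#include <arrow/result.h>
#include <arrow/status.h>

#include <cstdint>
#include <memory>

// Use the builder purely as an allocator: size the memory, write into it
// directly, then declare the final length instead of calling Append().
arrow::Result<std::shared_ptr<arrow::Buffer>> BuildDirect(int64_t n) {
  arrow::BufferBuilder builder;
  ARROW_RETURN_NOT_OK(builder.Resize(n, /*shrink_to_fit=*/false));
  uint8_t* out = builder.mutable_data();
  for (int64_t i = 0; i < n; ++i) {
    out[i] = static_cast<uint8_t>(i);
  }
  return builder.FinishWithLength(n);
}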
  void Reset() {
    buffer_ = NULLPTR;
    capacity_ = size_ = 0;
  }

  /// \brief Set size to a smaller value without modifying builder
  /// contents. For reusable BufferBuilder classes
  /// \param[in] position must be non-negative and less than or equal
  /// to the current length()
  void Rewind(int64_t position) { size_ = position; }

  int64_t capacity() const { return capacity_; }
  int64_t length() const { return size_; }
  const uint8_t* data() const { return data_; }
  uint8_t* mutable_data() { return data_; }

 private:
  std::shared_ptr<ResizableBuffer> buffer_;
  MemoryPool* pool_;
  uint8_t* data_;
  int64_t capacity_;
  int64_t size_;
  int64_t alignment_;
};
template <typename T, typename Enable = void>
class TypedBufferBuilder;

/// \brief A BufferBuilder for building a buffer of arithmetic elements
template <typename T>
class TypedBufferBuilder<
    T, typename std::enable_if<std::is_arithmetic<T>::value ||
                               std::is_standard_layout<T>::value>::type> {
 public:
  explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(),
                              int64_t alignment = kDefaultBufferAlignment)
      : bytes_builder_(pool, alignment) {}

  explicit TypedBufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
                              MemoryPool* pool = default_memory_pool())
      : bytes_builder_(std::move(buffer), pool) {}

  explicit TypedBufferBuilder(BufferBuilder builder)
      : bytes_builder_(std::move(builder)) {}

  BufferBuilder* bytes_builder() { return &bytes_builder_; }

  Status Append(T value) {
    return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T));
  }

  Status Append(const T* values, int64_t num_elements) {
    return bytes_builder_.Append(reinterpret_cast<const uint8_t*>(values),
                                 num_elements * sizeof(T));
  }

  Status Append(const int64_t num_copies, T value) {
    ARROW_RETURN_NOT_OK(Reserve(num_copies + length()));
    UnsafeAppend(num_copies, value);
    return Status::OK();
  }

  void UnsafeAppend(T value) {
    bytes_builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&value), sizeof(T));
  }

  void UnsafeAppend(const T* values, int64_t num_elements) {
    bytes_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values),
                                num_elements * sizeof(T));
  }

  template <typename Iter>
  void UnsafeAppend(Iter values_begin, Iter values_end) {
    int64_t num_elements = static_cast<int64_t>(std::distance(values_begin, values_end));
    auto data = mutable_data() + length();
    bytes_builder_.UnsafeAdvance(num_elements * sizeof(T));
    std::copy(values_begin, values_end, data);
  }

  void UnsafeAppend(const int64_t num_copies, T value) {
    auto data = mutable_data() + length();
    bytes_builder_.UnsafeAdvance(num_copies * sizeof(T));
    std::fill(data, data + num_copies, value);
  }

  Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
    return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit);
  }

  Status Reserve(const int64_t additional_elements) {
    return bytes_builder_.Reserve(additional_elements * sizeof(T));
  }

  Status Advance(const int64_t length) {
    return bytes_builder_.Advance(length * sizeof(T));
  }

  Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
    return bytes_builder_.Finish(out, shrink_to_fit);
  }

  Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
    std::shared_ptr<Buffer> out;
    ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
    return out;
  }

  /// \brief Like Finish, but override the final buffer size
  ///
  /// This is useful after writing data directly into the builder memory
  /// without calling the Append methods (basically, when using TypedBufferBuilder
  /// only for memory allocation).
  Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
                                                   bool shrink_to_fit = true) {
    return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit);
  }

  void Reset() { bytes_builder_.Reset(); }

  int64_t length() const { return bytes_builder_.length() / sizeof(T); }
  int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); }
  const T* data() const { return reinterpret_cast<const T*>(bytes_builder_.data()); }
  T* mutable_data() { return reinterpret_cast<T*>(bytes_builder_.mutable_data()); }

 private:
  BufferBuilder bytes_builder_;
};
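Usage mirrors BufferBuilder, except lengths and capacities are expressed in elements rather than bytes; a short sketch (BuildInts is a hypothetical helper):

#include <arrow/buffer.h>
#include <arrow/buffer_builder.h>
#include <arrow/result.h>
#include <arrow/status.h>

#include <cstdint>
#include <memory>

// Reserve three int32 slots (12 bytes internally), append, and finalize.
arrow::Result<std::shared_ptr<arrow::Buffer>> BuildInts() {
  arrow::TypedBufferBuilder<int32_t> builder;
  ARROW_RETURN_NOT_OK(builder.Reserve(3));
  builder.UnsafeAppend(int32_t{7});
  builder.UnsafeAppend(int32_t{8});
  builder.UnsafeAppend(int32_t{9});
  // builder.length() == 3 elements here, not bytes.
  return builder.Finish();
}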
/// \brief A BufferBuilder for building a buffer containing a bitmap
template <>
class TypedBufferBuilder<bool> {
 public:
  explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(),
                              int64_t alignment = kDefaultBufferAlignment)
      : bytes_builder_(pool, alignment) {}

  explicit TypedBufferBuilder(BufferBuilder builder)
      : bytes_builder_(std::move(builder)) {}

  BufferBuilder* bytes_builder() { return &bytes_builder_; }

  Status Append(bool value) {
    ARROW_RETURN_NOT_OK(Reserve(1));
    UnsafeAppend(value);
    return Status::OK();
  }

  Status Append(const uint8_t* valid_bytes, int64_t num_elements) {
    ARROW_RETURN_NOT_OK(Reserve(num_elements));
    UnsafeAppend(valid_bytes, num_elements);
    return Status::OK();
  }

  Status Append(const int64_t num_copies, bool value) {
    ARROW_RETURN_NOT_OK(Reserve(num_copies));
    UnsafeAppend(num_copies, value);
    return Status::OK();
  }

  void UnsafeAppend(bool value) {
    bit_util::SetBitTo(mutable_data(), bit_length_, value);
    if (!value) {
      ++false_count_;
    }
    ++bit_length_;
  }

  /// \brief Append bits from an array of bytes (one value per byte)
  void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) {
    if (num_elements == 0) return;
    int64_t i = 0;
    internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
      bool value = bytes[i++];
      false_count_ += !value;
      return value;
    });
    bit_length_ += num_elements;
  }

  /// \brief Append bits from a packed bitmap
  void UnsafeAppend(const uint8_t* bitmap, int64_t offset, int64_t num_elements) {
    if (num_elements == 0) return;
    internal::CopyBitmap(bitmap, offset, num_elements, mutable_data(), bit_length_);
    false_count_ += num_elements - internal::CountSetBits(bitmap, offset, num_elements);
    bit_length_ += num_elements;
  }

  void UnsafeAppend(const int64_t num_copies, bool value) {
    bit_util::SetBitsTo(mutable_data(), bit_length_, num_copies, value);
    false_count_ += num_copies * !value;
    bit_length_ += num_copies;
  }

  template <bool count_falses, typename Generator>
  void UnsafeAppend(const int64_t num_elements, Generator&& gen) {
    if (num_elements == 0) return;

    if (count_falses) {
      internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
        bool value = gen();
        false_count_ += !value;
        return value;
      });
    } else {
      internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements,
                                     std::forward<Generator>(gen));
    }
    bit_length_ += num_elements;
  }
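A hedged sketch of driving this generator overload from calling code (AppendAlternating is hypothetical; note Reserve must be called first, since this is an Unsafe method):

#include <arrow/buffer_builder.h>
#include <arrow/status.h>

// Emit eight alternating bits; count_falses=true keeps false_count() accurate.
arrow::Status AppendAlternating(arrow::TypedBufferBuilder<bool>* bits) {
  ARROW_RETURN_NOT_OK(bits->Reserve(8));
  int i = 0;
  bits->UnsafeAppend<true>(8, [&]() { return (i++ % 2) == 0; });
  return arrow::Status::OK();
}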
  Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
    const int64_t old_byte_capacity = bytes_builder_.capacity();
    ARROW_RETURN_NOT_OK(
        bytes_builder_.Resize(bit_util::BytesForBits(new_capacity), shrink_to_fit));
    // Resize() may have chosen a larger capacity (e.g. for padding),
    // so ask it again before calling memset().
    const int64_t new_byte_capacity = bytes_builder_.capacity();
    if (new_byte_capacity > old_byte_capacity) {
      // The additional buffer space is 0-initialized for convenience,
      // so that other methods can simply bump the length.
      memset(mutable_data() + old_byte_capacity, 0,
             static_cast<size_t>(new_byte_capacity - old_byte_capacity));
    }
    return Status::OK();
  }

  Status Reserve(const int64_t additional_elements) {
    return Resize(
        BufferBuilder::GrowByFactor(bit_length_, bit_length_ + additional_elements),
        false);
  }

  Status Advance(const int64_t length) {
    ARROW_RETURN_NOT_OK(Reserve(length));
    bit_length_ += length;
    false_count_ += length;
    return Status::OK();
  }

  Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
    // set bytes_builder_.size_ == byte size of data
    bytes_builder_.UnsafeAdvance(bit_util::BytesForBits(bit_length_) -
                                 bytes_builder_.length());
    bit_length_ = false_count_ = 0;
    return bytes_builder_.Finish(out, shrink_to_fit);
  }

  Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
    std::shared_ptr<Buffer> out;
    ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
    return out;
  }

  /// \brief Like Finish, but override the final buffer size
  ///
  /// This is useful after writing data directly into the builder memory
  /// without calling the Append methods (basically, when using TypedBufferBuilder
  /// only for memory allocation).
  Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
                                                   bool shrink_to_fit = true) {
    const auto final_byte_length = bit_util::BytesForBits(final_length);
    bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length());
    bit_length_ = false_count_ = 0;
    return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit);
  }

  void Reset() {
    bytes_builder_.Reset();
    bit_length_ = false_count_ = 0;
  }

  int64_t length() const { return bit_length_; }
  int64_t capacity() const { return bytes_builder_.capacity() * 8; }
  const uint8_t* data() const { return bytes_builder_.data(); }
  uint8_t* mutable_data() { return bytes_builder_.mutable_data(); }
  int64_t false_count() const { return false_count_; }

 private:
  BufferBuilder bytes_builder_;
  int64_t bit_length_ = 0;
  int64_t false_count_ = 0;
};

}  // namespace arrow
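Taken together, the bool specialization is the building block for Arrow validity bitmaps; a closing sketch (BuildValidity is a hypothetical helper):

#include <arrow/buffer.h>
#include <arrow/buffer_builder.h>
#include <arrow/result.h>
#include <arrow/status.h>

#include <memory>

// Pack five booleans into a bitmap; false_count() doubles as a null count.
arrow::Result<std::shared_ptr<arrow::Buffer>> BuildValidity() {
  arrow::TypedBufferBuilder<bool> builder;
  ARROW_RETURN_NOT_OK(builder.Append(true));
  ARROW_RETURN_NOT_OK(builder.Append(false));
  ARROW_RETURN_NOT_OK(builder.Append(/*num_copies=*/3, true));
  // builder.false_count() == 1 and builder.length() == 5 at this point.
  return builder.Finish();  // bits are packed LSB-first: 0b00011101
}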