Merging PR #218: openai_rev package with new streamlit chat app
This commit adds one new file (384 lines):
    venv/lib/python3.9/site-packages/pyarrow/orc.py
@@ -0,0 +1,384 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
|
||||
|
||||
from numbers import Integral
|
||||
import warnings
|
||||
|
||||
from pyarrow.lib import Table
|
||||
import pyarrow._orc as _orc
|
||||
from pyarrow.fs import _resolve_filesystem_and_path
|
||||
|
||||
|
||||
class ORCFile:
    """
    Reader interface for a single ORC file

    Parameters
    ----------
    source : str or pyarrow.NativeFile
        Readable source. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    """

    def __init__(self, source):
        # The Cython-level reader does the real work; open it eagerly so
        # metadata properties are immediately available.
        self.reader = _orc.ORCReader()
        self.reader.open(source)

    @property
    def metadata(self):
        """File-level key/value metadata, as an arrow KeyValueMetadata."""
        return self.reader.metadata()

    @property
    def schema(self):
        """Arrow schema describing the file's columns."""
        return self.reader.schema()

    @property
    def nrows(self):
        """Total number of rows stored in the file."""
        return self.reader.nrows()

    @property
    def nstripes(self):
        """Total number of stripes stored in the file."""
        return self.reader.nstripes()

    @property
    def file_version(self):
        """ORC format version of the file; either 0.11 or 0.12."""
        return self.reader.file_version()

    @property
    def software_version(self):
        """Identifier of the software instance/version that wrote the file."""
        return self.reader.software_version()

    @property
    def compression(self):
        """Compression codec used by the file."""
        return self.reader.compression()

    @property
    def compression_size(self):
        """Byte count buffered for the file's compression codec."""
        return self.reader.compression_size()

    @property
    def writer(self):
        """Name of the writer that produced this file, or its numeric
        Writer ID when the writer is not recognized."""
        return self.reader.writer()

    @property
    def writer_version(self):
        """Version of the writer that produced this file."""
        return self.reader.writer_version()

    @property
    def row_index_stride(self):
        """Rows per row-index entry, or 0 when the file has no row index."""
        return self.reader.row_index_stride()

    @property
    def nstripe_statistics(self):
        """Number of stripe statistics entries."""
        return self.reader.nstripe_statistics()

    @property
    def content_length(self):
        """Byte length of the file's data stripes."""
        return self.reader.content_length()

    @property
    def stripe_statistics_length(self):
        """Compressed byte size of the file's stripe statistics."""
        return self.reader.stripe_statistics_length()

    @property
    def file_footer_length(self):
        """Compressed byte size of the file footer."""
        return self.reader.file_footer_length()

    @property
    def file_postscript_length(self):
        """Byte size of the file postscript."""
        return self.reader.file_postscript_length()

    @property
    def file_length(self):
        """Total byte size of the file."""
        return self.reader.file_length()

    def _select_names(self, columns=None):
        # Normalize a column selection: None means "all columns".
        if columns is None:
            return None

        schema = self.schema
        n_fields = len(schema)
        resolved = []
        for entry in columns:
            if not isinstance(entry, Integral):
                # As soon as a non-integer entry is seen, the selection is
                # assumed to already be a list of names: return it untouched.
                return columns
            index = int(entry)
            if not (0 <= index < n_fields):
                raise ValueError("Column indices must be in 0 <= ind < %d,"
                                 " got %d" % (n_fields, index))
            resolved.append(schema[index].name)

        return resolved

    def read_stripe(self, n, columns=None):
        """Read a single stripe from the file.

        Parameters
        ----------
        n : int
            The stripe index
        columns : list
            If not None, only these columns will be read from the stripe. A
            column name may be a prefix of a nested field, e.g. 'a' will select
            'a.b', 'a.c', and 'a.d.e'

        Returns
        -------
        pyarrow.RecordBatch
            Content of the stripe as a RecordBatch.
        """
        selection = self._select_names(columns)
        return self.reader.read_stripe(n, columns=selection)

    def read(self, columns=None):
        """Read the whole file.

        Parameters
        ----------
        columns : list
            If not None, only these columns will be read from the file. A
            column name may be a prefix of a nested field, e.g. 'a' will select
            'a.b', 'a.c', and 'a.d.e'. Output always follows the
            ordering of the file and not the `columns` list.

        Returns
        -------
        pyarrow.Table
            Content of the file as a Table.
        """
        selection = self._select_names(columns)
        return self.reader.read(columns=selection)
# Shared numpydoc "Parameters" fragment describing the ORC writer options.
# It is spliced via str.format into the __doc__ of both ORCWriter and
# write_table so the option list is maintained in one place.
_orc_writer_args_docs = """file_version : {"0.11", "0.12"}, default "0.12"
    Determine which ORC file version to use.
    `Hive 0.11 / ORC v0 <https://orc.apache.org/specification/ORCv0/>`_
    is the older version
    while `Hive 0.12 / ORC v1 <https://orc.apache.org/specification/ORCv1/>`_
    is the newer one.
batch_size : int, default 1024
    Number of rows the ORC writer writes at a time.
stripe_size : int, default 64 * 1024 * 1024
    Size of each ORC stripe in bytes.
compression : string, default 'uncompressed'
    The compression codec.
    Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'}
    Note that LZ0 is currently not supported.
compression_block_size : int, default 64 * 1024
    Size of each compression block in bytes.
compression_strategy : string, default 'speed'
    The compression strategy i.e. speed vs size reduction.
    Valid values: {'SPEED', 'COMPRESSION'}
row_index_stride : int, default 10000
    The row index stride i.e. the number of rows per
    an entry in the row index.
padding_tolerance : double, default 0.0
    The padding tolerance.
dictionary_key_size_threshold : double, default 0.0
    The dictionary key size threshold. 0 to disable dictionary encoding.
    1 to always enable dictionary encoding.
bloom_filter_columns : None, set-like or list-like, default None
    Columns that use the bloom filter.
bloom_filter_fpp : double, default 0.05
    Upper limit of the false-positive rate of the bloom filter.
"""
class ORCWriter:
    __doc__ = """
Writer interface for a single ORC file

Parameters
----------
where : str or pyarrow.io.NativeFile
    Writable target. For passing Python file objects or byte buffers,
    see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream
    or pyarrow.io.FixedSizeBufferWriter.
{}
""".format(_orc_writer_args_docs)

    # Class-level default keeps __del__/close() safe even when __init__
    # raises before the instance attribute is set.
    is_open = False

    def __init__(self, where, *,
                 file_version='0.12',
                 batch_size=1024,
                 stripe_size=64 * 1024 * 1024,
                 compression='uncompressed',
                 compression_block_size=65536,
                 compression_strategy='speed',
                 row_index_stride=10000,
                 padding_tolerance=0.0,
                 dictionary_key_size_threshold=0.0,
                 bloom_filter_columns=None,
                 bloom_filter_fpp=0.05,
                 ):
        # Gather the writer options once and forward them to the
        # Cython-level writer in a single open() call.
        options = dict(
            file_version=file_version,
            batch_size=batch_size,
            stripe_size=stripe_size,
            compression=compression,
            compression_block_size=compression_block_size,
            compression_strategy=compression_strategy,
            row_index_stride=row_index_stride,
            padding_tolerance=padding_tolerance,
            dictionary_key_size_threshold=dictionary_key_size_threshold,
            bloom_filter_columns=bloom_filter_columns,
            bloom_filter_fpp=bloom_filter_fpp,
        )
        self.writer = _orc.ORCWriter()
        self.writer.open(where, **options)
        self.is_open = True

    def __del__(self):
        # Best-effort cleanup; close() is a no-op when already closed.
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        self.close()

    def write(self, table):
        """
        Write the table into an ORC file. The schema of the table must
        be equal to the schema used when opening the ORC file.

        Parameters
        ----------
        table : pyarrow.Table
            The table to be written into the ORC file
        """
        assert self.is_open
        self.writer.write(table)

    def close(self):
        """
        Close the ORC file
        """
        if self.is_open:
            self.writer.close()
            self.is_open = False
def read_table(source, columns=None, filesystem=None):
    # Resolve the (filesystem, path) pair; when a filesystem is identified,
    # replace the raw path with an opened input file handle.
    filesystem, path = _resolve_filesystem_and_path(source, filesystem)
    if filesystem is not None:
        source = filesystem.open_input_file(path)

    orc_file = ORCFile(source)
    if columns is not None and len(columns) == 0:
        # Empty selection: read the file, then project away every column;
        # the resulting table still carries the correct num_rows.
        return orc_file.read().select(columns)
    return orc_file.read(columns=columns)


read_table.__doc__ = """
Read a Table from an ORC file.

Parameters
----------
source : str, pyarrow.NativeFile, or file-like object
    If a string passed, can be a single file name. For file-like objects,
    only read a single file. Use pyarrow.BufferReader to read a file
    contained in a bytes or buffer-like object.
columns : list
    If not None, only these columns will be read from the file. A column
    name may be a prefix of a nested field, e.g. 'a' will select 'a.b',
    'a.c', and 'a.d.e'. Output always follows the ordering of the file and
    not the `columns` list. If empty, no columns will be read. Note
    that the table will still have the correct num_rows set despite having
    no columns.
filesystem : FileSystem, default None
    If nothing passed, will be inferred based on path.
    Path will try to be found in the local on-disk filesystem otherwise
    it will be parsed as an URI to determine the filesystem.
"""
def write_table(table, where, *,
                file_version='0.12',
                batch_size=1024,
                stripe_size=64 * 1024 * 1024,
                compression='uncompressed',
                compression_block_size=65536,
                compression_strategy='speed',
                row_index_stride=10000,
                padding_tolerance=0.0,
                dictionary_key_size_threshold=0.0,
                bloom_filter_columns=None,
                bloom_filter_fpp=0.05):
    # Legacy call order was write_table(where, table); detect it by the
    # type of `where` and swap, warning that the old order will go away.
    if isinstance(where, Table):
        warnings.warn(
            "The order of the arguments has changed. Pass as "
            "'write_table(table, where)' instead. The old order will raise "
            "an error in the future.", FutureWarning, stacklevel=2
        )
        table, where = where, table

    writer_options = dict(
        file_version=file_version,
        batch_size=batch_size,
        stripe_size=stripe_size,
        compression=compression,
        compression_block_size=compression_block_size,
        compression_strategy=compression_strategy,
        row_index_stride=row_index_stride,
        padding_tolerance=padding_tolerance,
        dictionary_key_size_threshold=dictionary_key_size_threshold,
        bloom_filter_columns=bloom_filter_columns,
        bloom_filter_fpp=bloom_filter_fpp,
    )
    with ORCWriter(where, **writer_options) as writer:
        writer.write(table)


write_table.__doc__ = """
Write a table into an ORC file.

Parameters
----------
table : pyarrow.lib.Table
    The table to be written into the ORC file
where : str or pyarrow.io.NativeFile
    Writable target. For passing Python file objects or byte buffers,
    see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream
    or pyarrow.io.FixedSizeBufferWriter.
{}
""".format(_orc_writer_args_docs)
|
||||
Reference in New Issue
Block a user