datasource.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2025, NVIDIA CORPORATION.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cudf/io/types.hpp>
20 #include <cudf/utilities/error.hpp>
21 #include <cudf/utilities/export.hpp>
22 #include <cudf/utilities/span.hpp>
23 
24 #include <rmm/cuda_stream_view.hpp>
25 
26 #include <future>
27 #include <memory>
28 
29 namespace CUDF_EXPORT cudf {
31 namespace io {
32 
/**
 * @brief Interface class for providing input data to the readers.
 *
 * Instances are obtained either through the static `create` factories declared below or by
 * implementing this interface in a derived class. A derived class must implement both
 * `host_read` overloads and `size()`, which are the pure virtual members; the device-read
 * members have failing default implementations and only need to be overridden by sources that
 * report `supports_device_read()`.
 */
42 class datasource {
43  public:
  /**
   * @brief Buffer implementation that owns its backing container.
   *
   * Declared ahead of `buffer` because `buffer::create` names it; the definition is at the end
   * of this class.
   */
44  template <typename Container>
45  class owning_buffer; // forward declaration
  /**
   * @brief Interface class for buffers that the datasource returns to the caller.
   */
51  class buffer {
52  public:
    /**
     * @brief Returns the buffer size in bytes.
     *
     * @return Buffer size in bytes
     */
58  [[nodiscard]] virtual size_t size() const = 0;
59 
    /**
     * @brief Returns the address of the data in the buffer.
     *
     * @return Address of the data in the buffer
     */
65  [[nodiscard]] virtual uint8_t const* data() const = 0;
66 
    /**
     * @brief Base class destructor.
     */
70  virtual ~buffer() = default;
71 
    /**
     * @brief Factory to construct a datasource buffer object from a container.
     *
     * The argument is forwarded into an `owning_buffer<Container>`, which stores the container
     * as a member. `owning_buffer` static_asserts that its constructor argument is an rvalue,
     * so passing an lvalue (which deduces `Container` as an lvalue reference) is rejected at
     * compile time.
     *
     * @tparam Container Type of the container that owns the data
     * @param data_owner Container to move into the new buffer
     * @return Buffer that owns the moved-in container
     */
79  template <typename Container>
80  static std::unique_ptr<buffer> create(Container&& data_owner)
81  {
82  return std::make_unique<owning_buffer<Container>>(std::forward<Container>(data_owner));
83  }
84  };
85 
  /**
   * @brief Creates a source from a file path.
   *
   * NOTE(review): only the declaration is visible here. `offset` and `max_size_estimate` both
   * default to 0; they presumably describe where reading starts and an upper-bound hint for how
   * much will be read -- confirm against the implementation.
   *
   * @param filepath Path to the file to use
   * @param offset Defaults to 0
   * @param max_size_estimate Defaults to 0
   * @return Constructed datasource object
   */
101  static std::unique_ptr<datasource> create(std::string const& filepath,
102  size_t offset = 0,
103  size_t max_size_estimate = 0);
104 
  /**
   * @brief Creates a source from a host memory buffer.
   *
   * NOTE(review): `host_buffer` is a non-owning view, so the viewed memory presumably has to
   * outlive the returned source -- confirm against the implementation.
   *
   * @param buffer Host buffer object
   * @return Constructed datasource object
   */
113  static std::unique_ptr<datasource> create(host_buffer const& buffer);
114 
  /**
   * @brief Creates a source from a host memory buffer.
   *
   * @param buffer Span over the host bytes
   * @return Constructed datasource object
   */
121  static std::unique_ptr<datasource> create(cudf::host_span<std::byte const> buffer);
122 
  /**
   * @brief Creates a source from a device memory buffer.
   *
   * @param buffer Span over the device bytes
   * @return Constructed datasource object
   */
129  static std::unique_ptr<datasource> create(cudf::device_span<std::byte const> buffer);
130 
  /**
   * @brief Creates a source from a user-implemented datasource object.
   *
   * NOTE(review): the argument is a raw pointer and ownership is not visible from this
   * declaration; presumably the returned object only wraps `source`, which must then outlive
   * it -- confirm against the implementation.
   *
   * @param source Pointer to the user-implemented datasource
   * @return Constructed datasource object
   */
137  static std::unique_ptr<datasource> create(datasource* source);
138 
  /**
   * @brief Creates a vector of datasources, one per element in the input vector.
   *
   * Every element is passed to the `datasource::create` overload that accepts a `T`, and the
   * results are returned in input order.
   *
   * NOTE(review): this uses `std::transform` and `std::back_inserter`, but neither <algorithm>
   * nor <iterator> is among the includes at the top of this header; it currently relies on
   * them being pulled in transitively.
   *
   * @tparam T Argument type accepted by one of the single-source `create` overloads
   * @param args Arguments, one per datasource to create
   * @return Constructed vector of datasource objects
   */
145  template <typename T>
146  static std::vector<std::unique_ptr<datasource>> create(std::vector<T> const& args)
147  {
148  std::vector<std::unique_ptr<datasource>> sources;
149  sources.reserve(args.size());
150  std::transform(args.cbegin(), args.cend(), std::back_inserter(sources), [](auto const& arg) {
151  return datasource::create(arg);
152  });
153  return sources;
154  }
155 
  /**
   * @brief Base class destructor.
   */
159  virtual ~datasource() = default;
160 
  /**
   * @brief Returns a buffer with a subset of data from the source.
   *
   * @param offset Position in the source at which the requested range starts
   * @param size Length of the requested range
   * @return Buffer holding the data that was read
   */
169  virtual std::unique_ptr<datasource::buffer> host_read(size_t offset, size_t size) = 0;
170 
  /**
   * @brief Asynchronously reads a specified portion of data from the datasource.
   *
   * Virtual but not pure: it is only declared here, so the default implementation lives
   * outside this header.
   *
   * @param offset Position in the source at which the requested range starts
   * @param size Length of the requested range
   * @return Future that yields the buffer holding the data that was read
   */
183  virtual std::future<std::unique_ptr<datasource::buffer>> host_read_async(size_t offset,
184  size_t size);
185 
  /**
   * @brief Reads a selected range into a preallocated buffer.
   *
   * @param offset Position in the source at which the requested range starts
   * @param size Length of the requested range
   * @param dst Address of the preallocated destination
   * @return NOTE(review): presumably the number of bytes actually read -- confirm
   */
195  virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0;
196 
  /**
   * @brief Asynchronously reads data from the source into the provided host memory buffer.
   *
   * Virtual but not pure: it is only declared here, so the default implementation lives
   * outside this header.
   *
   * @param offset Position in the source at which the requested range starts
   * @param size Length of the requested range
   * @param dst Address of the preallocated host destination
   * @return Future for the value described on the synchronous `host_read` overload
   */
211  virtual std::future<size_t> host_read_async(size_t offset, size_t size, uint8_t* dst);
212 
  /**
   * @brief Whether or not this source supports reading directly into device memory.
   *
   * The base implementation returns false. A source that returns true is expected to override
   * the `device_read` members, whose base implementations always fail.
   *
   * @return true if the source supports direct device reads
   */
225  [[nodiscard]] virtual bool supports_device_read() const { return false; }
226 
  /**
   * @brief Estimates whether a direct device read would be more optimal for the given size.
   *
   * The base implementation ignores `size` and simply reports `supports_device_read()`.
   *
   * @param size Length of the read being considered
   * @return true if a device read is expected to be preferable for this size
   */
233  [[nodiscard]] virtual bool is_device_read_preferred(size_t size) const
234  {
235  return supports_device_read();
236  }
237 
  /**
   * @brief Returns a device buffer with a subset of data from the source.
   *
   * The base implementation unconditionally reports an error through CUDF_FAIL; sources that
   * support device reads must override it.
   *
   * @param offset Position in the source at which the requested range starts
   * @param size Length of the requested range
   * @param stream CUDA stream to use
   * @return Buffer holding the data that was read
   */
254  virtual std::unique_ptr<datasource::buffer> device_read(size_t offset,
255  size_t size,
256  rmm::cuda_stream_view stream)
257  {
258  CUDF_FAIL("datasource classes that support device_read must override it.");
259  }
260 
  /**
   * @brief Reads a selected range into a preallocated device buffer.
   *
   * The base implementation unconditionally reports an error through CUDF_FAIL; sources that
   * support device reads must override it.
   *
   * @param offset Position in the source at which the requested range starts
   * @param size Length of the requested range
   * @param dst Address of the preallocated device destination
   * @param stream CUDA stream to use
   * @return NOTE(review): presumably the number of bytes actually read -- confirm
   */
278  virtual size_t device_read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream)
279  {
280  CUDF_FAIL("datasource classes that support device_read must override it.");
281  }
282 
  /**
   * @brief Asynchronously reads a selected range into a preallocated device buffer.
   *
   * The base implementation unconditionally reports an error through CUDF_FAIL instead of
   * returning a future; sources that support asynchronous device reads must override it.
   *
   * @param offset Position in the source at which the requested range starts
   * @param size Length of the requested range
   * @param dst Address of the preallocated device destination
   * @param stream CUDA stream to use
   * @return Future for the value described on the synchronous `device_read` overload
   */
306  virtual std::future<size_t> device_read_async(size_t offset,
307  size_t size,
308  uint8_t* dst,
309  rmm::cuda_stream_view stream)
310  {
311  CUDF_FAIL("datasource classes that support device_read_async must override it.");
312  }
313 
  /**
   * @brief Returns the size of the data in the source.
   *
   * @return Size of the data in the source
   */
319  [[nodiscard]] virtual size_t size() const = 0;
320 
  /**
   * @brief Returns whether the source is empty.
   *
   * The base implementation compares `size()` with zero.
   *
   * @return true if the source holds no data
   */
326  [[nodiscard]] virtual bool is_empty() const { return size() == 0; }
327 
  /**
   * @brief Implementation for a non-owning buffer, where the datasource holds the buffer until
   * destruction.
   *
   * Stores only a pointer and a size; nothing is released when the object is destroyed.
   */
331  class non_owning_buffer : public buffer {
332  public:
    /**
     * @brief Constructs an empty buffer (null data pointer, size zero).
     */
333  non_owning_buffer() = default;
334 
    /**
     * @brief Construct a new non-owning buffer object.
     *
     * @param data Address of the data to refer to
     * @param size Size of the referenced data
     */
341  non_owning_buffer(uint8_t const* data, size_t size) : _data(data), _size(size) {}
342 
    /**
     * @brief Returns the size of the buffer.
     *
     * @return Size passed at construction, or 0 for a default-constructed buffer
     */
348  [[nodiscard]] size_t size() const override { return _size; }
349 
    /**
     * @brief Returns the pointer to the buffer.
     *
     * @return Pointer passed at construction, or nullptr for a default-constructed buffer
     */
355  [[nodiscard]] uint8_t const* data() const override { return _data; }
356 
357  private:
    // Referenced data; not owned by this object.
358  uint8_t const* _data{nullptr};
    // Size reported by size().
359  size_t _size{0};
360  };
361 
  /**
   * @brief Derived implementation of `buffer` that owns the data.
   *
   * The container is moved into the object and lives as long as the buffer does.
   *
   * @tparam Container Type of the container object that owns the data
   */
369  template <typename Container>
370  class owning_buffer : public buffer {
371  public:
372  // Require that the argument passed to the constructor be an rvalue (Container&& being an rvalue
373  // reference).
374  static_assert(std::is_rvalue_reference_v<Container&&>,
375  "The container argument passed to the constructor must be an rvalue.");
376 
    /**
     * @brief Moves the input container into the newly created object.
     *
     * The exposed pointer and size are taken from the stored container's `data()` and `size()`
     * after the move.
     *
     * @param moved_data_owner Container to move into this buffer
     */
383  owning_buffer(Container&& moved_data_owner)
384  : _data(std::move(moved_data_owner)), _data_ptr(_data.data()), _size(_data.size())
385  {
386  }
387 
    /**
     * @brief Moves the input container into the newly created object, and exposes a subspan of
     * the buffer.
     *
     * `data_ptr` and `size` are stored as given; this constructor does not check that the range
     * lies inside the container.
     *
     * @param moved_data_owner Container to move into this buffer
     * @param data_ptr Address that `data()` should report
     * @param size Size that `size()` should report
     */
397  owning_buffer(Container&& moved_data_owner, uint8_t const* data_ptr, size_t size)
398  : _data(std::move(moved_data_owner)), _data_ptr(data_ptr), _size(size)
399  {
400  }
401 
    /**
     * @brief Returns the size of the buffer.
     *
     * @return Size recorded at construction
     */
407  [[nodiscard]] size_t size() const override { return _size; }
408 
    /**
     * @brief Returns the pointer to the data in the buffer.
     *
     * @return Address recorded at construction, converted from `void const*`
     */
414  [[nodiscard]] uint8_t const* data() const override
415  {
416  return static_cast<uint8_t const*>(_data_ptr);
417  }
418 
419  private:
    // Declaration order matters: members are initialized in this order, and the single-argument
    // constructor reads _data.data() / _data.size() to initialize the two members below.
420  Container _data;
421  void const* _data_ptr;
422  size_t _size;
423  };
424 };
425  // end of group
427 } // namespace io
428 } // namespace CUDF_EXPORT cudf
Interface class for buffers that the datasource returns to the caller.
Definition: datasource.hpp:51
virtual ~buffer()=default
Base class destructor.
static std::unique_ptr< buffer > create(Container &&data_owner)
Factory to construct a datasource buffer object from a container.
Definition: datasource.hpp:80
virtual size_t size() const =0
Returns the buffer size in bytes.
virtual uint8_t const * data() const =0
Returns the address of the data in the buffer.
Implementation for a non-owning buffer, where the datasource holds the buffer until destruction.
Definition: datasource.hpp:331
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:348
uint8_t const * data() const override
Returns the pointer to the buffer.
Definition: datasource.hpp:355
non_owning_buffer(uint8_t const *data, size_t size)
Constructs a new non-owning buffer object.
Definition: datasource.hpp:341
Derived implementation of buffer that owns the data.
Definition: datasource.hpp:370
owning_buffer(Container &&moved_data_owner)
Moves the input container into the newly created object.
Definition: datasource.hpp:383
owning_buffer(Container &&moved_data_owner, uint8_t const *data_ptr, size_t size)
Moves the input container into the newly created object, and exposes a subspan of the buffer.
Definition: datasource.hpp:397
size_t size() const override
Returns the size of the buffer.
Definition: datasource.hpp:407
uint8_t const * data() const override
Returns the pointer to the data in the buffer.
Definition: datasource.hpp:414
Interface class for providing input data to the readers.
Definition: datasource.hpp:42
virtual ~datasource()=default
Base class destructor.
static std::vector< std::unique_ptr< datasource > > create(std::vector< T > const &args)
Creates a vector of datasources, one per element in the input vector.
Definition: datasource.hpp:146
virtual bool supports_device_read() const
Whether or not this source supports reading directly into device memory.
Definition: datasource.hpp:225
static std::unique_ptr< datasource > create(datasource *source)
Creates a source from a user-implemented datasource object.
virtual std::future< std::unique_ptr< datasource::buffer > > host_read_async(size_t offset, size_t size)
Asynchronously reads a specified portion of data from the datasource.
virtual size_t device_read(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:278
virtual bool is_device_read_preferred(size_t size) const
Estimates whether a direct device read would be more optimal for the given size.
Definition: datasource.hpp:233
static std::unique_ptr< datasource > create(cudf::device_span< std::byte const > buffer)
Creates a source from a device memory buffer.
virtual std::future< size_t > device_read_async(size_t offset, size_t size, uint8_t *dst, rmm::cuda_stream_view stream)
Asynchronously reads a selected range into a preallocated device buffer.
Definition: datasource.hpp:306
virtual bool is_empty() const
Returns whether the source is empty, i.e. contains no data.
Definition: datasource.hpp:326
virtual std::future< size_t > host_read_async(size_t offset, size_t size, uint8_t *dst)
Asynchronously reads data from the source into the provided host memory buffer.
virtual size_t host_read(size_t offset, size_t size, uint8_t *dst)=0
Reads a selected range into a preallocated buffer.
static std::unique_ptr< datasource > create(host_buffer const &buffer)
Creates a source from a host memory buffer.
virtual std::unique_ptr< datasource::buffer > device_read(size_t offset, size_t size, rmm::cuda_stream_view stream)
Returns a device buffer with a subset of data from the source.
Definition: datasource.hpp:254
virtual size_t size() const =0
Returns the size of the data in the source.
virtual std::unique_ptr< datasource::buffer > host_read(size_t offset, size_t size)=0
Returns a buffer with a subset of data from the source.
static std::unique_ptr< datasource > create(cudf::host_span< std::byte const > buffer)
Creates a source from a host memory buffer.
static std::unique_ptr< datasource > create(std::string const &filepath, size_t offset=0, size_t max_size_estimate=0)
Creates a source from a file path.
std::unique_ptr< column > transform(std::vector< column_view > const &inputs, std::string const &transform_udf, data_type output_type, bool is_ptx, rmm::cuda_stream_view stream=cudf::get_default_stream(), rmm::device_async_resource_ref mr=cudf::get_current_device_resource_ref())
Creates a new column by applying a transform function against every element of the input columns.
#define CUDF_FAIL(...)
Indicates that an erroneous code path has been taken.
Definition: error.hpp:217
cuDF-IO API type definitions
cuDF interfaces
Definition: host_udf.hpp:37
APIs for spans.
Device version of C++20 std::span with reduced feature set.
Definition: span.hpp:355
C++20 std::span with reduced feature set.
Definition: span.hpp:194
Non-owning view of a host memory buffer.
Definition: io/types.hpp:315