@@ -8,12 +8,18 @@
 //
 // Today we have the following formats: CSV, NDJSON, Parquet, Apache Arrow and Apache Arrow Stream.
 //
+use polars::frame::chunk_df_for_writing;
 use polars::prelude::*;
+use polars::{io::schema_to_arrow_checked, prelude::CompatLevel};
+use polars_arrow::io::ipc::write::{
+    default_ipc_fields, encode_record_batch, schema_to_bytes, EncodedData, WriteOptions,
+};
 use std::num::NonZeroUsize;
 
 use rustler::{Binary, Env, NewBinary};
 use std::fs::File;
-use std::io::{BufReader, BufWriter, Cursor};
+use std::io::{BufReader, BufWriter, Cursor, Write};
+use std::sync::Arc;
 
 use crate::datatypes::{ExParquetCompression, ExQuoteStyle, ExS3Entry, ExSeriesDtype};
 use crate::{ExDataFrame, ExplorerError};
@@ -443,6 +449,160 @@ fn decode_ipc_compression(compression: &str) -> Result<IpcCompression, ExplorerE |
 }
 }
 
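+/// Decodes the Arrow compatibility level: "oldest" keeps maximum consumer
+/// compatibility, "newest" allows the newest Arrow types polars can emit.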
+fn decode_compat_level(compat_level: &str) -> Result<CompatLevel, ExplorerError> {
+    match compat_level {
+        "oldest" => Ok(CompatLevel::oldest()),
+        "newest" => Ok(CompatLevel::newest()),
+        other => Err(ExplorerError::Other(format!(
+            "the compat level {other} is not supported"
+        ))),
+    }
+}
+
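+/// Dumps only the encapsulated schema message of the Arrow IPC stream format,
+/// so callers can assemble a stream incrementally.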
+#[rustler::nif(schedule = "DirtyCpu")]
+pub fn df_dump_ipc_schema<'a>(
+    env: Env<'a>,
+    df: ExDataFrame,
+    compat_level: Option<&str>,
+) -> Result<Binary<'a>, ExplorerError> {
+    let compat_level = match compat_level {
+        Some(level) => decode_compat_level(level)?,
+        None => CompatLevel::oldest(),
+    };
+    let schema = schema_to_arrow_checked(df.schema(), compat_level, "ipc")?;
+    let ipc_fields = default_ipc_fields(schema.iter_values());
+    let schema_bytes = schema_to_bytes(&schema, &ipc_fields, None);
+    // A schema message carries only flatbuffer metadata and has no body.
+    let encoded_message = EncodedData {
+        ipc_message: schema_bytes,
+        arrow_data: Vec::new(),
+    };
+
+    let mut buf = vec![];
+    write_message(&mut buf, &encoded_message)?;
+
+    let mut values_binary = NewBinary::new(env, buf.len());
+    values_binary.copy_from_slice(&buf);
+
+    Ok(values_binary.into())
+}
+
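+/// Dumps each record batch as its own encapsulated IPC message, sizing the
+/// batches so each encoded message stays roughly under `max_chunk_size`
+/// bytes (default 10 MiB).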
+#[rustler::nif(schedule = "DirtyCpu")]
+pub fn df_dump_ipc_record_batch<'a>(
+    env: Env<'a>,
+    df: ExDataFrame,
+    max_chunk_size: Option<usize>,
+    compression: Option<&str>,
+    compat_level: Option<&str>,
+) -> Result<Vec<Binary<'a>>, ExplorerError> {
+    let data = &mut df.clone();
+
+    let max_request_bytes = max_chunk_size.unwrap_or(10 * (1 << 20)); // 10 MiB
+    let chunk_num = data.estimated_size() / max_request_bytes + 1;
+    // Rows per chunk, so each encoded batch stays under the byte budget.
+    let chunk_size = (data.height() / chunk_num).max(1);
+
+    chunk_df_for_writing(data, chunk_size)?;
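+
+    // Worked example of the sizing above (illustrative): a 25 MiB frame under
+    // the default 10 MiB budget gives chunk_num = 25 / 10 + 1 = 3; at 300_000
+    // rows that is 100_000 rows per chunk.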
+
+    let compat_level = match compat_level {
+        Some(level) => decode_compat_level(level)?,
+        None => CompatLevel::oldest(),
+    };
+    let iter = data.iter_chunks(compat_level, true);
+
+    let compression = match compression {
+        Some(algo) => Some(decode_ipc_compression(algo)?.into()),
+        None => None,
+    };
+    let options = WriteOptions { compression };
+
+    let mut result = Vec::new();
+
+    for batch in iter {
+        let mut encoded_message = EncodedData::default();
+        encode_record_batch(&batch, &options, &mut encoded_message);
+
+        let mut buf = vec![];
+        write_message(&mut buf, &encoded_message)?;
+        let mut values_binary = NewBinary::new(env, buf.len());
+        values_binary.copy_from_slice(&buf);
+
+        result.push(values_binary.into());
+    }
+
+    Ok(result)
+}
+
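+// A minimal sketch (illustrative; not an API in this module) of how the two
+// dumps compose into a complete Arrow IPC stream:
+//
+//     let mut stream = vec![];
+//     stream.extend_from_slice(&schema_message); // from df_dump_ipc_schema
+//     for batch in &batch_messages {
+//         // each from df_dump_ipc_record_batch
+//         stream.extend_from_slice(batch);
+//     }
+//     // end-of-stream marker: the continuation bytes with a zero length
+//     stream.extend_from_slice(&[0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0]);
+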
+/// Writes an encapsulated IPC message: the continuation marker, the
+/// little-endian metadata length, the flatbuffer metadata padded to an
+/// 8-byte boundary, then the body buffers padded to 64 bytes. Returns the
+/// metadata and body lengths written.
+///
+/// Adapted from https://github.com/pola-rs/polars/blob/main/crates/polars-arrow/src/io/ipc/write/common_sync.rs
+/// because the original is not public to external crates.
+pub fn write_message<W: Write>(
+    writer: &mut W,
+    encoded: &EncodedData,
+) -> PolarsResult<(usize, usize)> {
+    let arrow_data_len = encoded.arrow_data.len();
+
+    // The 8-byte prefix plus the flatbuffer metadata must end on an
+    // 8-byte boundary; `a` is the alignment mask.
+    let a = 8 - 1;
+    let buffer = &encoded.ipc_message;
+    let flatbuf_size = buffer.len();
+    let prefix_size = 8;
+    let aligned_size = (flatbuf_size + prefix_size + a) & !a;
+    let padding_bytes = aligned_size - flatbuf_size - prefix_size;
+
+    write_continuation(writer, (aligned_size - prefix_size) as i32)?;
+
+    // write the flatbuffer metadata
+    if flatbuf_size > 0 {
+        writer.write_all(buffer)?;
+    }
+    // write padding; the alignment is to an 8-byte boundary, so at most
+    // 8 bytes are needed
+    const PADDING_MAX: [u8; 8] = [0u8; 8];
+    writer.write_all(&PADDING_MAX[..padding_bytes])?;
+
+    // write the arrow body buffers, if any
+    let body_len = if arrow_data_len > 0 {
+        write_body_buffers(writer, &encoded.arrow_data)?
+    } else {
+        0
+    };
+
+    Ok((aligned_size, body_len))
+}
+
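+// Worked example for write_message (illustrative): a 20-byte flatbuffer gives
+// aligned_size = (20 + 8 + 7) & !7 = 32, so the length prefix written is 24,
+// followed by the 20 metadata bytes and 4 bytes of padding.
+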
+fn write_body_buffers<W: Write>(mut writer: W, data: &[u8]) -> PolarsResult<usize> {
+    let len = data.len();
+    let pad_len = pad_to_64(len);
+    let total_len = len + pad_len;
+
+    // write the body buffer followed by its zero padding
+    writer.write_all(data)?;
+    if pad_len > 0 {
+        writer.write_all(&vec![0u8; pad_len][..])?;
+    }
+
+    Ok(total_len)
+}
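+
+// Bodies are padded to 64 bytes since the Arrow spec recommends 64-byte
+// buffer alignment, keeping reads SIMD- and cache-friendly.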
+
+/// Writes the encapsulated-message prefix: the 0xFFFFFFFF continuation marker
+/// followed by the metadata length as a little-endian i32. Returns the 8
+/// bytes written.
+fn write_continuation<W: Write>(writer: &mut W, total_len: i32) -> PolarsResult<usize> {
+    const CONTINUATION_MARKER: [u8; 4] = [0xff; 4];
+    writer.write_all(&CONTINUATION_MARKER)?;
+    writer.write_all(&total_len.to_le_bytes()[..])?;
+    Ok(8)
+}
+
+/// Returns the number of padding bytes needed to round `len` up to the next
+/// 64-byte boundary, e.g. `pad_to_64(100) == 28`.
+#[inline]
+fn pad_to_64(len: usize) -> usize {
+    ((len + 63) & !63) - len
+}
+
 // ============ IPC Streaming ============ //
 
 #[rustler::nif(schedule = "DirtyIo")]