Skip to content

Series#

Each column in a Table is a Series. Series expose methods which invoke high-performance kernels for manipulation of a column of data.

Series #

Series()

A Daft Series is an array of data of a single type, and is usually a column in a DataFrame.

Methods:

Name Description
from_arrow

Construct a Series from a pyarrow array or chunked array.

from_numpy

Construct a Series from a NumPy ndarray.

from_pandas

Construct a Series from a pandas Series.

from_pylist

Construct a Series from a Python list.

to_arrow

Convert this Series to a pyarrow array.

to_pylist

Convert this Series to a Python list.

Source code in daft/series.py
55
56
def __init__(self) -> None:
    """Reject direct construction of a Series.

    Raises:
        NotImplementedError: Always; creating a Series via ``__init__`` is not supported.
    """
    message = "We do not support creating a Series via __init__ "
    raise NotImplementedError(message)

from_arrow #

from_arrow(
    array: Array | ChunkedArray, name: str = "arrow_series"
) -> Series

Construct a Series from a pyarrow array or chunked array.

Parameters:

Name Type Description Default
array Array | ChunkedArray

The pyarrow (chunked) array whose data we wish to put in the Series.

required
name str

The name associated with the Series; this is usually the column name.

'arrow_series'
Source code in daft/series.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@staticmethod
def from_arrow(array: pa.Array | pa.ChunkedArray, name: str = "arrow_series") -> Series:
    """Construct a Series from a pyarrow array or chunked array.

    Args:
        array: The pyarrow (chunked) array whose data we wish to put in the Series.
        name: The name associated with the Series; this is usually the column name.

    Returns:
        A Series named ``name`` holding the data of ``array``.

    Raises:
        TypeError: If ``array`` is neither a ``pa.Array`` nor a ``pa.ChunkedArray``.
    """
    _ensure_registered_super_ext_type()
    if DataType.from_arrow_type(array.type) == DataType.python():
        # If the Arrow type is not natively supported, go through the Python list path.
        return Series.from_pylist(array.to_pylist(), name=name, pyobj="force")
    elif isinstance(array, pa.Array):
        array = ensure_array(array)
        # ``getattr(..., ())`` keeps this check valid on pyarrow builds that do not
        # expose FixedShapeTensorType: isinstance against an empty tuple is False.
        if isinstance(array.type, getattr(pa, "FixedShapeTensorType", ())):
            # Build the Series from the extension's storage array, then cast it
            # to the Daft type that corresponds to the tensor extension type.
            series = Series.from_arrow(array.storage, name=name)
            return series.cast(DataType.from_arrow_type(array.type))
        else:
            pys = PySeries.from_arrow(name, array)
            return Series._from_pyseries(pys)
    elif isinstance(array, pa.ChunkedArray):
        array = ensure_chunked_array(array)
        arr_type = array.type
        if isinstance(arr_type, pa.BaseExtensionType):
            # Extension-typed chunks are combined via their storage type and then
            # re-wrapped in the extension type.
            combined_storage_array = array.cast(arr_type.storage_type).combine_chunks()
            combined_array = arr_type.wrap_array(combined_storage_array)
        else:
            combined_array = array.combine_chunks()
        # Forward ``name`` so a chunked input keeps the caller's column name
        # instead of silently falling back to the default "arrow_series".
        return Series.from_arrow(combined_array, name=name)
    else:
        raise TypeError(f"expected either PyArrow Array or Chunked Array, got {type(array)}")

from_numpy #

from_numpy(
    data: ndarray[Any, Any], name: str = "numpy_series"
) -> Series

Construct a Series from a NumPy ndarray.

If the provided NumPy ndarray is 1-dimensional, Daft will attempt to store the ndarray in a pyarrow Array. If the ndarray has more than 1 dimension OR storing the 1D array in Arrow failed, Daft will store the ndarray data as a Python list of NumPy ndarrays.

Parameters:

Name Type Description Default
data ndarray[Any, Any]

The NumPy ndarray whose data we wish to put in the Series.

required
name str

The name associated with the Series; this is usually the column name.

'numpy_series'
Source code in daft/series.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
@classmethod
def from_numpy(cls, data: np.ndarray[Any, Any], name: str = "numpy_series") -> Series:
    """Build a Series out of a NumPy ndarray.

    Arrays with at most one dimension are first offered to pyarrow. When pyarrow
    rejects the data with ``ArrowInvalid``, or when the array has more than one
    dimension, the data is stored as a Python list of NumPy ndarrays instead.

    Args:
        data: The NumPy ndarray whose data we wish to put in the Series.
        name: The name associated with the Series; this is usually the column name.

    Raises:
        TypeError: If ``data`` is not a NumPy ndarray.
    """
    if not isinstance(data, np.ndarray):
        raise TypeError(f"Expected a NumPy ndarray, got {type(data)}")

    # Only the pyarrow conversion itself is guarded; errors raised while building
    # the Series from the resulting Arrow array are allowed to propagate.
    converted = None
    if data.ndim <= 1:
        try:
            converted = pa.array(data)
        except pa.ArrowInvalid:
            converted = None
    if converted is not None:
        return cls.from_arrow(converted, name=name)

    # TODO(Clark): Represent the tensor series with an Arrow extension type in order
    # to keep the series data contiguous.
    elements = list(map(np.asarray, data))
    return cls.from_pylist(elements, name=name, pyobj="allow")

from_pandas #

from_pandas(
    data: Series[Any], name: str = "pd_series"
) -> Series

Construct a Series from a pandas Series.

This will first try to convert the series into a pyarrow array, then will fall back to converting the series to a NumPy ndarray and going through that construction path, and will finally fall back to converting the series to a Python list and going through that path.

Parameters:

Name Type Description Default
data Series[Any]

The pandas Series whose data we wish to put in the Daft Series.

required
name str

The name associated with the Series; this is usually the column name.

'pd_series'
Source code in daft/series.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
@classmethod
def from_pandas(cls, data: pd.Series[Any], name: str = "pd_series") -> Series:
    """Build a Daft Series out of a pandas Series.

    Three construction paths are attempted in order: conversion to a pyarrow
    array, conversion to a NumPy ndarray (via ``from_numpy``), and finally
    conversion to a Python list (via ``from_pylist``).

    Args:
        data: The pandas Series whose data we wish to put in the Daft Series.
        name: The name associated with the Series; this is usually the column name.

    Raises:
        TypeError: If ``data`` is not a pandas Series.
    """
    if not isinstance(data, pd.Series):
        raise TypeError(f"expected a pandas Series, got {type(data)}")

    # Path 1: Arrow. Only the pandas -> Arrow conversion is guarded; errors from
    # building the Series out of the Arrow array propagate to the caller.
    try:
        arrow_arr = pa.Array.from_pandas(data)
    except pa.ArrowInvalid:
        arrow_arr = None
    if arrow_arr is not None:
        return cls.from_arrow(arrow_arr, name=name)

    # Path 2: NumPy. Any exception raised here (by the ndarray conversion or by
    # from_numpy) is deliberately swallowed so that the pylist path below can run;
    # pd.Series.to_list() preserves more type information for types that are not
    # natively representable in Python.
    try:
        return cls.from_numpy(data.to_numpy(), name=name)
    except Exception:
        pass

    # Path 3: Python list.
    # NOTE: For element types that don't have a native Python representation,
    # a Pandas scalar object will be returned.
    values = data.to_list()
    return cls.from_pylist(values, name=name, pyobj="force")

from_pylist #

from_pylist(
    data: list[Any],
    name: str = "list_series",
    pyobj: Literal["allow", "disallow", "force"] = "allow",
) -> Series

Construct a Series from a Python list.

The resulting type depends on the setting of pyobj:
  • "allow": Arrow-backed types if possible, else PyObject;
  • "disallow": Arrow-backed types only, raising error if not convertible;
  • "force": Store as PyObject types.

Parameters:

Name Type Description Default
data list[Any]

The Python list whose data we wish to put in the Series.

required
name str

The name associated with the Series; this is usually the column name.

'list_series'
pyobj Literal['allow', 'disallow', 'force']

Whether we want to "allow" coercion to Arrow types, "disallow" falling back to Python type representation, or "force" the data to only have a Python type representation. Default is "allow".

'allow'
Source code in daft/series.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
@staticmethod
def from_pylist(
    data: list[Any],
    name: str = "list_series",
    pyobj: Literal["allow", "disallow", "force"] = "allow",
) -> Series:
    """Construct a Series from a Python list.

    The resulting type depends on the setting of ``pyobj``:
        - ``"allow"``: Arrow-backed types if possible, else PyObject;
        - ``"disallow"``: Arrow-backed types only, raising error if not convertible;
        - ``"force"``: Store as PyObject types.

    Args:
        data: The Python list whose data we wish to put in the Series.
        name: The name associated with the Series; this is usually the column name.
        pyobj: Whether we want to ``"allow"`` coercion to Arrow types, ``"disallow"``
            falling back to Python type representation, or ``"force"`` the data to only
            have a Python type representation. Default is ``"allow"``.

    Raises:
        TypeError: If ``data`` is not a Python list.
        ValueError: If ``pyobj`` is not one of ``"allow"``, ``"disallow"`` or ``"force"``.
        pa.lib.ArrowInvalid: If ``pyobj`` is ``"disallow"`` and the Arrow conversion
            raised this error.
    """
    if not isinstance(data, list):
        raise TypeError(f"expected a python list, got {type(data)}")

    # A tuple (rather than a set) keeps the membership test working for
    # unhashable values, so they get the ValueError below instead of an
    # unrelated "unhashable type" TypeError.
    if pyobj not in ("allow", "disallow", "force"):
        raise ValueError(f"pyobj: expected either 'allow', 'disallow', or 'force', but got {pyobj!r}")

    if pyobj == "force":
        # Skip Arrow entirely and store the values as Python objects.
        pys = PySeries.from_pylist(name, data, pyobj=pyobj)
        return Series._from_pyseries(pys)

    try:
        # Workaround: wrap list of np.datetime64 in an np.array
        #   - https://github.com/apache/arrow/issues/40580
        #   - https://github.com/Eventual-Inc/Daft/issues/3826
        if data and np.module_available() and isinstance(data[0], np.datetime64):  # type: ignore[attr-defined]
            np_arr = np.array(data)
            arrow_array = pa.array(np_arr)
        else:
            arrow_array = pa.array(data)
        return Series.from_arrow(arrow_array, name=name)
    except pa.lib.ArrowInvalid:
        # "disallow" surfaces the Arrow error; "allow" falls back to Python objects.
        if pyobj == "disallow":
            raise
        pys = PySeries.from_pylist(name, data, pyobj=pyobj)
        return Series._from_pyseries(pys)

to_arrow #

to_arrow() -> Array

Convert this Series to a pyarrow array.

Source code in daft/series.py
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def to_arrow(self) -> pa.Array:
    """Convert this Series to a pyarrow array.

    Returns:
        The Arrow array produced by the underlying series. For fixed-shape tensor
        Series, when the installed PyArrow supports it, the result is an extension
        array of the corresponding pyarrow dtype wrapping that array's storage.
    """
    _ensure_registered_super_ext_type()

    dtype = self.datatype()
    arrow_arr = self._series.to_arrow()

    # Special-case for PyArrow FixedShapeTensor if it is supported by the version of PyArrow
    # TODO: Push this down into self._series.to_arrow()?
    if dtype.is_fixed_shape_tensor() and pyarrow_supports_fixed_shape_tensor():
        pyarrow_dtype = dtype.to_arrow_dtype()
        # Reuse the array converted above rather than converting the series a second time.
        return pa.ExtensionArray.from_storage(pyarrow_dtype, arrow_arr.storage)

    return arrow_arr

to_pylist #

to_pylist() -> list[Any]

Convert this Series to a Python list.

Source code in daft/series.py
274
275
276
277
278
279
280
281
def to_pylist(self) -> list[Any]:
    """Return the contents of this Series as a Python list.

    Python-typed Series are converted directly. Types flagged by
    ``_should_cast_to_python()`` are cast to the Python type first. Everything
    else is converted through its Arrow representation.
    """
    dtype = self.datatype()
    if dtype.is_python():
        return self._series.to_pylist()
    if dtype._should_cast_to_python():
        python_dtype = DataType.python()._dtype
        return self._series.cast(python_dtype).to_pylist()
    arrow_arr = self._series.to_arrow()
    return arrow_arr.to_pylist()