Schema#

Daft can display your DataFrame's schema without materializing it. Under the hood, it performs intelligent sampling of your data to determine the appropriate schema, and if you modify your DataFrame it infers the resulting types from the operation. Learn more about schemas in the Daft User Guide.
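
For example, a minimal sketch of lazy schema inspection (the file path and the id column below are placeholders, not real data):

import daft
from daft import col

# Lazily scan a CSV; no data is materialized at this point.
df = daft.read_csv("path/to/events.csv")

# Printing the schema only triggers schema inference (sampling), not a full read.
print(df.schema())

# Types of derived columns are inferred from the operation itself.
df = df.with_column("id_plus_one", col("id") + 1)
print(df.schema())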

Schema #

Schema()

Methods:

- apply_hints
- column_names
- estimate_row_size_bytes
- from_csv
- from_json
- from_parquet
- from_pyarrow_schema: Creates a Daft Schema from a PyArrow Schema.
- from_pydict
- to_name_set
- to_pyarrow_schema: Converts a Daft Schema to a PyArrow Schema.
- union

Source code in daft/schema.py
def __init__(self) -> None:
    raise NotImplementedError("We do not support creating a Schema via __init__ ")

apply_hints #

apply_hints(hints: Schema) -> Schema
Source code in daft/schema.py
def apply_hints(self, hints: Schema) -> Schema:
    return Schema._from_pyschema(self._schema.apply_hints(hints._schema))
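
A minimal sketch of applying hints, assuming that columns named in the hint schema take the hinted type while the rest are unchanged (column names and types are illustrative):

from daft import DataType
from daft.schema import Schema

base = Schema.from_pydict({"id": DataType.int64(), "ts": DataType.string()})
hints = Schema.from_pydict({"ts": DataType.date()})

# Assumed behavior: "ts" picks up the hinted Date type; "id" keeps Int64.
hinted = base.apply_hints(hints)
print(hinted)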

column_names #

column_names() -> list[str]
Source code in daft/schema.py
def column_names(self) -> list[str]:
    return list(self._schema.names())

estimate_row_size_bytes #

estimate_row_size_bytes() -> float
Source code in daft/schema.py
def estimate_row_size_bytes(self) -> float:
    return self._schema.estimate_row_size_bytes()
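
A short sketch combining the two inspection helpers above (the column names and types are illustrative):

from daft import DataType
from daft.schema import Schema

schema = Schema.from_pydict({"id": DataType.int64(), "name": DataType.string()})

print(schema.column_names())             # ['id', 'name']
print(schema.estimate_row_size_bytes())  # estimated bytes per row, as a float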

from_csv #

from_csv(
    path: str,
    parse_options: CsvParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema
Source code in daft/schema.py
@classmethod
def from_csv(
    cls,
    path: str,
    parse_options: CsvParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema:
    return Schema._from_pyschema(
        _read_csv_schema(
            uri=path,
            parse_options=parse_options,
            io_config=io_config,
            multithreaded_io=multithreaded_io,
        )
    )
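
For example, inferring a schema straight from a CSV file without constructing a DataFrame (the path is a placeholder; pass an IOConfig when reading from remote storage such as S3):

from daft.schema import Schema

# Only the schema is inferred; the file is not loaded into a DataFrame.
schema = Schema.from_csv("path/to/data.csv")
print(schema.column_names())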

from_json #

from_json(
    path: str,
    parse_options: JsonParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema
Source code in daft/schema.py
@classmethod
def from_json(
    cls,
    path: str,
    parse_options: JsonParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema:
    return Schema._from_pyschema(
        _read_json_schema(
            uri=path,
            parse_options=parse_options,
            io_config=io_config,
            multithreaded_io=multithreaded_io,
        )
    )
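
The JSON variant follows the same pattern (placeholder path; line-delimited JSON is assumed here):

from daft.schema import Schema

schema = Schema.from_json("path/to/data.jsonl")
print(schema)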

from_parquet #

from_parquet(
    path: str,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
    coerce_int96_timestamp_unit: TimeUnit = ns(),
) -> Schema
Source code in daft/schema.py
@classmethod
def from_parquet(
    cls,
    path: str,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
    coerce_int96_timestamp_unit: TimeUnit = TimeUnit.ns(),
) -> Schema:
    return Schema._from_pyschema(
        _read_parquet_schema(
            uri=path,
            io_config=io_config,
            multithreaded_io=multithreaded_io,
            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit._timeunit,
        )
    )
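
A sketch of Parquet schema inference that overrides the INT96 timestamp coercion. The import path for TimeUnit and its ms() constructor are assumptions based on the ns() default shown above; the file path is a placeholder:

from daft.datatype import TimeUnit  # assumed import path
from daft.schema import Schema

# Coerce legacy INT96 timestamps to millisecond precision instead of the
# default nanoseconds.
schema = Schema.from_parquet(
    "path/to/data.parquet",
    coerce_int96_timestamp_unit=TimeUnit.ms(),
)
print(schema)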

from_pyarrow_schema #

from_pyarrow_schema(pa_schema: pa.Schema) -> Schema

Creates a Daft Schema from a PyArrow Schema.

Parameters:

- pa_schema (pa.Schema, required): PyArrow schema to convert

Returns:

- Schema: Converted Daft schema

Source code in daft/schema.py
@classmethod
def from_pyarrow_schema(cls, pa_schema: pa.Schema) -> Schema:
    """Creates a Daft Schema from a PyArrow Schema.

    Args:
        pa_schema (pa.Schema): PyArrow schema to convert

    Returns:
        Schema: Converted Daft schema
    """
    return cls._from_field_name_and_types(
        [(pa_field.name, DataType.from_arrow_type(pa_field.type)) for pa_field in pa_schema]
    )
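
For example, converting an existing PyArrow schema (field names chosen for illustration):

import pyarrow as pa

from daft.schema import Schema

pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
daft_schema = Schema.from_pyarrow_schema(pa_schema)
print(daft_schema.column_names())  # ['id', 'name']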

from_pydict #

from_pydict(fields: dict[str, DataType]) -> Schema
Source code in daft/schema.py
@classmethod
def from_pydict(cls, fields: dict[str, DataType]) -> Schema:
    return cls._from_fields([Field.create(k, v) for k, v in fields.items()])
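
Since Schema() cannot be constructed directly (see __init__ above), from_pydict is the usual way to build a schema by hand; a minimal sketch:

from daft import DataType
from daft.schema import Schema

schema = Schema.from_pydict({"id": DataType.int64(), "score": DataType.float64()})
print(schema)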

to_name_set #

to_name_set() -> set[str]
Source code in daft/schema.py
def to_name_set(self) -> set[str]:
    return set(self.column_names())

to_pyarrow_schema #

to_pyarrow_schema() -> pa.Schema

Converts a Daft Schema to a PyArrow Schema.

Returns:

- pa.Schema: PyArrow schema that corresponds to the provided Daft schema

Source code in daft/schema.py
def to_pyarrow_schema(self) -> pa.Schema:
    """Converts a Daft Schema to a PyArrow Schema.

    Returns:
        pa.Schema: PyArrow schema that corresponds to the provided Daft schema
    """
    _ensure_registered_super_ext_type()
    return self._schema.to_pyarrow_schema()
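
A round-trip sketch building on the from_pyarrow_schema example above:

import pyarrow as pa

from daft.schema import Schema

pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
roundtripped = Schema.from_pyarrow_schema(pa_schema).to_pyarrow_schema()
print(roundtripped)  # a pa.Schema corresponding to the Daft schema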

union #

union(other: Schema) -> Schema
Source code in daft/schema.py
def union(self, other: Schema) -> Schema:
    if not isinstance(other, Schema):
        raise ValueError(f"Expected Schema, got other: {type(other)}")

    return Schema._from_pyschema(self._schema.union(other._schema))
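
A sketch of combining two schemas with disjoint column names; passing anything other than a Schema raises a ValueError, as the source above shows:

from daft import DataType
from daft.schema import Schema

left = Schema.from_pydict({"id": DataType.int64()})
right = Schema.from_pydict({"name": DataType.string()})

combined = left.union(right)
print(combined.column_names())  # ['id', 'name']

try:
    left.union("not a schema")
except ValueError as err:
    print(err)  # Expected Schema, got other: <class 'str'>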

Schema has been moved to daft.schema but is still accessible at daft.logical.schema.
