Schema#

Daft can display your DataFrame's schema without materializing it. Under the hood, it performs intelligent sampling of your data to determine the appropriate schema, and if you modify your DataFrame it infers the resulting types from the operation. Learn more about schemas in the Daft User Guide.
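
For example, a minimal sketch of lazy schema inspection (the file path and the id column below are placeholders, not real data):

import daft
from daft import col

# Lazily scan a CSV; no data is materialized at this point.
df = daft.read_csv("path/to/events.csv")

# Printing the schema only triggers schema inference (sampling), not a full read.
print(df.schema())

# Types of derived columns are inferred from the operation itself.
df = df.with_column("id_plus_one", col("id") + 1)
print(df.schema())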

Schema #

Schema()

Methods:

- apply_hints
- column_names
- estimate_row_size_bytes
- from_csv
- from_json
- from_parquet
- from_pyarrow_schema: Creates a Daft Schema from a PyArrow Schema.
- from_pydict
- to_name_set
- to_pyarrow_schema: Converts a Daft Schema to a PyArrow Schema.
- union

Source code in daft/schema.py
def __init__(self) -> None:
    raise NotImplementedError("We do not support creating a Schema via __init__ ")

apply_hints #

apply_hints(hints: Schema) -> Schema
Source code in daft/schema.py
def apply_hints(self, hints: Schema) -> Schema:
    return Schema._from_pyschema(self._schema.apply_hints(hints._schema))
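
A minimal sketch of applying hints, assuming that columns named in the hint schema take the hinted type while the rest are unchanged (column names and types are illustrative):

from daft import DataType
from daft.schema import Schema

base = Schema.from_pydict({"id": DataType.int64(), "ts": DataType.string()})
hints = Schema.from_pydict({"ts": DataType.date()})

# Assumed behavior: "ts" picks up the hinted Date type; "id" keeps Int64.
hinted = base.apply_hints(hints)
print(hinted)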

column_names #

column_names() -> list[str]
Source code in daft/schema.py
def column_names(self) -> list[str]:
    return list(self._schema.names())

estimate_row_size_bytes #

estimate_row_size_bytes() -> float
Source code in daft/schema.py
def estimate_row_size_bytes(self) -> float:
    return self._schema.estimate_row_size_bytes()
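
A short sketch combining the two inspection helpers above (the column names and types are illustrative):

from daft import DataType
from daft.schema import Schema

schema = Schema.from_pydict({"id": DataType.int64(), "name": DataType.string()})

print(schema.column_names())             # ['id', 'name']
print(schema.estimate_row_size_bytes())  # estimated bytes per row, as a float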

from_csv #

from_csv(
    path: str,
    parse_options: CsvParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema
Source code in daft/schema.py
@classmethod
def from_csv(
    cls,
    path: str,
    parse_options: CsvParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema:
    return Schema._from_pyschema(
        _read_csv_schema(
            uri=path,
            parse_options=parse_options,
            io_config=io_config,
            multithreaded_io=multithreaded_io,
        )
    )
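
For example, inferring a schema straight from a CSV file without constructing a DataFrame (the path is a placeholder; pass an IOConfig when reading from remote storage such as S3):

from daft.schema import Schema

# Only the schema is inferred; the file is not loaded into a DataFrame.
schema = Schema.from_csv("path/to/data.csv")
print(schema.column_names())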

from_json #

from_json(
    path: str,
    parse_options: JsonParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema
Source code in daft/schema.py
@classmethod
def from_json(
    cls,
    path: str,
    parse_options: JsonParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema:
    return Schema._from_pyschema(
        _read_json_schema(
            uri=path,
            parse_options=parse_options,
            io_config=io_config,
            multithreaded_io=multithreaded_io,
        )
    )
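
The JSON variant follows the same pattern (placeholder path; line-delimited JSON is assumed here):

from daft.schema import Schema

schema = Schema.from_json("path/to/data.jsonl")
print(schema)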

from_parquet #

from_parquet(
    path: str,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
    coerce_int96_timestamp_unit: TimeUnit = ns(),
) -> Schema
Source code in daft/schema.py
@classmethod
def from_parquet(
    cls,
    path: str,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
    coerce_int96_timestamp_unit: TimeUnit = TimeUnit.ns(),
) -> Schema:
    return Schema._from_pyschema(
        _read_parquet_schema(
            uri=path,
            io_config=io_config,
            multithreaded_io=multithreaded_io,
            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit._timeunit,
        )
    )
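
A sketch of Parquet schema inference that overrides the INT96 timestamp coercion. The import path for TimeUnit and its ms() constructor are assumptions based on the ns() default shown above; the file path is a placeholder:

from daft.datatype import TimeUnit  # assumed import path
from daft.schema import Schema

# Coerce legacy INT96 timestamps to millisecond precision instead of the
# default nanoseconds.
schema = Schema.from_parquet(
    "path/to/data.parquet",
    coerce_int96_timestamp_unit=TimeUnit.ms(),
)
print(schema)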

from_pyarrow_schema #

from_pyarrow_schema(pa_schema: pa.Schema) -> Schema

Creates a Daft Schema from a PyArrow Schema.

Parameters:

- pa_schema (pa.Schema, required): PyArrow schema to convert

Returns:

- Schema: Converted Daft schema

Source code in daft/schema.py
@classmethod
def from_pyarrow_schema(cls, pa_schema: pa.Schema) -> Schema:
    """Creates a Daft Schema from a PyArrow Schema.

    Args:
        pa_schema (pa.Schema): PyArrow schema to convert

    Returns:
        Schema: Converted Daft schema
    """
    return cls._from_field_name_and_types(
        [(pa_field.name, DataType.from_arrow_type(pa_field.type)) for pa_field in pa_schema]
    )
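
For example, converting an existing PyArrow schema (field names chosen for illustration):

import pyarrow as pa

from daft.schema import Schema

pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
daft_schema = Schema.from_pyarrow_schema(pa_schema)
print(daft_schema.column_names())  # ['id', 'name']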

from_pydict #

from_pydict(fields: dict[str, DataType]) -> Schema
Source code in daft/schema.py
@classmethod
def from_pydict(cls, fields: dict[str, DataType]) -> Schema:
    return cls._from_fields([Field.create(k, v) for k, v in fields.items()])
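
Since Schema() cannot be constructed directly (see __init__ above), from_pydict is the usual way to build a schema by hand; a minimal sketch:

from daft import DataType
from daft.schema import Schema

schema = Schema.from_pydict({"id": DataType.int64(), "score": DataType.float64()})
print(schema)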

to_name_set #

to_name_set() -> set[str]
Source code in daft/schema.py
def to_name_set(self) -> set[str]:
    return set(self.column_names())

to_pyarrow_schema #

to_pyarrow_schema() -> pa.Schema

Converts a Daft Schema to a PyArrow Schema.

Returns:

- pa.Schema: PyArrow schema that corresponds to the provided Daft schema

Source code in daft/schema.py
def to_pyarrow_schema(self) -> pa.Schema:
    """Converts a Daft Schema to a PyArrow Schema.

    Returns:
        pa.Schema: PyArrow schema that corresponds to the provided Daft schema
    """
    _ensure_registered_super_ext_type()
    return self._schema.to_pyarrow_schema()
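
A round-trip sketch building on the from_pyarrow_schema example above:

import pyarrow as pa

from daft.schema import Schema

pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
roundtripped = Schema.from_pyarrow_schema(pa_schema).to_pyarrow_schema()
print(roundtripped)  # a pa.Schema corresponding to the Daft schema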

union #

union(other: Schema) -> Schema
Source code in daft/schema.py
def union(self, other: Schema) -> Schema:
    if not isinstance(other, Schema):
        raise ValueError(f"Expected Schema, got other: {type(other)}")

    return Schema._from_pyschema(self._schema.union(other._schema))
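
A sketch of combining two schemas with disjoint column names; passing anything other than a Schema raises a ValueError, as the source above shows:

from daft import DataType
from daft.schema import Schema

left = Schema.from_pydict({"id": DataType.int64()})
right = Schema.from_pydict({"name": DataType.string()})

combined = left.union(right)
print(combined.column_names())  # ['id', 'name']

try:
    left.union("not a schema")
except ValueError as err:
    print(err)  # Expected Schema, got other: <class 'str'>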

Schema has been moved to daft.schema but is still accessible at daft.logical.schema.
