Schema
Daft can display your DataFrame's schema without materializing the data. Under the hood, it samples your data intelligently to determine the appropriate schema, and when you modify a DataFrame it infers the resulting types from the operation. Learn more about Schemas in the Daft User Guide.
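For example, a minimal sketch (assuming a hypothetical local file data.csv) showing that the schema is available before any data is read:

    import daft

    # Reading is lazy: no data is materialized at this point.
    df = daft.read_csv("data.csv")  # hypothetical path

    # The inferred schema can be inspected without executing the query.
    print(df.schema())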
Schema
Source code in daft/schema.py
    def __init__(self) -> None:
        raise NotImplementedError("We do not support creating a Schema via __init__ ")
apply_hints
apply_hints(hints: Schema) -> Schema
Source code in daft/schema.py
    def apply_hints(self, hints: Schema) -> Schema:
        return Schema._from_pyschema(self._schema.apply_hints(hints._schema))
column_names
column_names() -> list[str]
Source code in daft/schema.py
    def column_names(self) -> list[str]:
        return list(self._schema.names())
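For example, a minimal sketch of listing field names in order, using a schema built with from_pydict (documented below):

    from daft import DataType
    from daft.schema import Schema

    schema = Schema.from_pydict({"id": DataType.int64(), "name": DataType.string()})
    print(schema.column_names())  # ['id', 'name']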
estimate_row_size_bytes
estimate_row_size_bytes() -> float
Source code in daft/schema.py
    def estimate_row_size_bytes(self) -> float:
        return self._schema.estimate_row_size_bytes()
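The estimate is derived from the field types rather than from scanning data, so it is best treated as a rough planning figure. A minimal sketch, reusing the schema from the previous example:

    row_bytes = schema.estimate_row_size_bytes()
    print(f"~{row_bytes:.1f} estimated bytes per row")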
from_csv
from_csv(
    path: str,
    parse_options: CsvParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema
Source code in daft/schema.py
    @classmethod
    def from_csv(
        cls,
        path: str,
        parse_options: CsvParseOptions | None = None,
        io_config: IOConfig | None = None,
        multithreaded_io: bool | None = None,
    ) -> Schema:
        return Schema._from_pyschema(
            _read_csv_schema(
                uri=path,
                parse_options=parse_options,
                io_config=io_config,
                multithreaded_io=multithreaded_io,
            )
        )
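A minimal sketch of inferring a schema directly from a CSV file without building a DataFrame, assuming a hypothetical local file data.csv:

    from daft.schema import Schema

    # Infers column names and types from the file; no DataFrame is created.
    schema = Schema.from_csv("data.csv")  # hypothetical path
    print(schema.column_names())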
from_json
from_json(
    path: str,
    parse_options: JsonParseOptions | None = None,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
) -> Schema
Source code in daft/schema.py
    @classmethod
    def from_json(
        cls,
        path: str,
        parse_options: JsonParseOptions | None = None,
        io_config: IOConfig | None = None,
        multithreaded_io: bool | None = None,
    ) -> Schema:
        return Schema._from_pyschema(
            _read_json_schema(
                uri=path,
                parse_options=parse_options,
                io_config=io_config,
                multithreaded_io=multithreaded_io,
            )
        )
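Schema inference from line-delimited JSON follows the same pattern. A minimal sketch, assuming a hypothetical records.jsonl file:

    from daft.schema import Schema

    schema = Schema.from_json("records.jsonl")  # hypothetical path
    print(schema)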
from_parquet
from_parquet(
    path: str,
    io_config: IOConfig | None = None,
    multithreaded_io: bool | None = None,
    coerce_int96_timestamp_unit: TimeUnit = TimeUnit.ns(),
) -> Schema
Source code in daft/schema.py
    @classmethod
    def from_parquet(
        cls,
        path: str,
        io_config: IOConfig | None = None,
        multithreaded_io: bool | None = None,
        coerce_int96_timestamp_unit: TimeUnit = TimeUnit.ns(),
    ) -> Schema:
        return Schema._from_pyschema(
            _read_parquet_schema(
                uri=path,
                io_config=io_config,
                multithreaded_io=multithreaded_io,
                coerce_int96_timestamp_unit=coerce_int96_timestamp_unit._timeunit,
            )
        )
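Parquet files embed their schema, so only a path (plus an IOConfig for remote storage, if needed) is required. A minimal sketch, assuming a hypothetical data.parquet file:

    from daft.schema import Schema

    schema = Schema.from_parquet("data.parquet")  # hypothetical path
    print(schema)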
from_pyarrow_schema
from_pyarrow_schema(pa_schema: pa.Schema) -> Schema
Creates a Daft Schema from a PyArrow Schema.
Parameters:
Name | Type | Description | Default
pa_schema | pa.Schema | PyArrow schema to convert | required
Returns:
Name | Type | Description
Schema | Schema | Converted Daft schema
Source code in daft/schema.py
    @classmethod
    def from_pyarrow_schema(cls, pa_schema: pa.Schema) -> Schema:
        """Creates a Daft Schema from a PyArrow Schema.

        Args:
            pa_schema (pa.Schema): PyArrow schema to convert

        Returns:
            Schema: Converted Daft schema
        """
        return cls._from_field_name_and_types(
            [(pa_field.name, DataType.from_arrow_type(pa_field.type)) for pa_field in pa_schema]
        )
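A minimal sketch of converting a PyArrow schema into a Daft schema:

    import pyarrow as pa
    from daft.schema import Schema

    pa_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
    daft_schema = Schema.from_pyarrow_schema(pa_schema)
    print(daft_schema.column_names())  # ['id', 'name']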
from_pydict
from_pydict(fields: dict[str, DataType]) -> Schema
Source code in daft/schema.py
    @classmethod
    def from_pydict(cls, fields: dict[str, DataType]) -> Schema:
        return cls._from_fields([Field.create(k, v) for k, v in fields.items()])
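A minimal sketch of building a schema explicitly from field names and Daft DataTypes:

    from daft import DataType
    from daft.schema import Schema

    schema = Schema.from_pydict({
        "user_id": DataType.int64(),
        "score": DataType.float64(),
        "is_active": DataType.bool(),
    })
    print(schema)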
to_name_set
to_name_set() -> set[str]
Source code in daft/schema.py
    def to_name_set(self) -> set[str]:
        return set(self.column_names())
to_pyarrow_schema
to_pyarrow_schema() -> pa.Schema
Converts a Daft Schema to a PyArrow Schema.
Returns:
Type | Description
pa.Schema | PyArrow schema that corresponds to the provided Daft schema
Source code in daft/schema.py
    def to_pyarrow_schema(self) -> pa.Schema:
        """Converts a Daft Schema to a PyArrow Schema.

        Returns:
            pa.Schema: PyArrow schema that corresponds to the provided Daft schema
        """
        _ensure_registered_super_ext_type()
        return self._schema.to_pyarrow_schema()
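Converting back to PyArrow is useful when handing results to Arrow-based tooling. A minimal sketch, reusing the schema from the from_pydict example above:

    pa_schema = schema.to_pyarrow_schema()
    print(pa_schema)  # pyarrow.Schema with matching field names and types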
union
union(other: Schema) -> Schema
Source code in daft/schema.py
    def union(self, other: Schema) -> Schema:
        if not isinstance(other, Schema):
            raise ValueError(f"Expected Schema, got other: {type(other)}")
        return Schema._from_pyschema(self._schema.union(other._schema))
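A minimal sketch of combining two schemas with disjoint field names:

    from daft import DataType
    from daft.schema import Schema

    left = Schema.from_pydict({"id": DataType.int64()})
    right = Schema.from_pydict({"name": DataType.string()})
    print(left.union(right).column_names())  # ['id', 'name']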
Schema has been moved to daft.schema but is still accessible at daft.logical.schema.