Schema inference is the process of generating a schema from all or part of a dataset.
Once a schema has been inferred, it can be saved and reused to speed up subsequent processing of the dataset.
All extractors will try to auto-detect the schema; however, providing a schema explicitly is always good practice,
since it can significantly speed up the extraction process by avoiding expensive schema detection.
Code
<?php
declare(strict_types=1);
use function Flow\ETL\Adapter\CSV\from_csv;
use function Flow\ETL\DSL\{data_frame, schema_from_json, schema_to_json, to_stream};
use Flow\ETL\Loader\StreamLoader\Output;
require __DIR__ . '/../../../autoload.php';
// Location of the cached schema: it is inferred on the first run and reused afterwards.
$schemaPath = __DIR__ . '/output/schema.json';

if (!\file_exists($schemaPath)) {
    // No cached schema yet - infer one from a sample of the dataset.
    $schema = data_frame()
        ->read(from_csv(__DIR__ . '/input/dataset.csv'))
        ->limit(100) // Reading fewer rows speeds up inference but might produce a less accurate schema
        ->autoCast()
        ->schema();

    // The result of the write is not checked, so a failed write does not abort this run;
    // the schema is simply inferred again next time the file is still missing.
    \file_put_contents($schemaPath, schema_to_json($schema));
} else {
    $schemaJson = \file_get_contents($schemaPath);

    // file_get_contents() returns false on failure; fail with a clear message instead of
    // passing false on to schema_from_json().
    if ($schemaJson === false) {
        throw new \RuntimeException("Unable to read cached schema from {$schemaPath}");
    }

    $schema = schema_from_json($schemaJson);
}
// CSV carries no schema of its own, so handing the extractor a predefined schema
// can significantly improve read performance.
$collected = data_frame()
    ->read(from_csv(__DIR__ . '/input/dataset.csv', schema: $schema))
    ->collect();

// Write the collected rows together with their schema to the output file.
$collected
    ->write(to_stream(__DIR__ . '/output.txt', truncate: false, output: Output::rows_and_schema))
    ->run();