Skip to content
Search
Examples

Schema

Description

Automatically detect the schema from your data. Inferred schemas can be saved and reused to speed up subsequent processing by avoiding repeated schema detection.

Documentation

Code

<?php

declare(strict_types=1);

use function Flow\ETL\Adapter\CSV\from_csv;
use function Flow\ETL\DSL\{data_frame, schema_from_json, schema_to_json, to_output};
use Flow\ETL\Loader\StreamLoader\Output;
use function Flow\Filesystem\DSL\fstab;
use function Flow\Filesystem\DSL\path;

require __DIR__ . '/vendor/autoload.php';

// Locations used throughout the example: the cached schema and the CSV dataset.
$schemaFile = __DIR__ . '/output/schema.json';
$datasetFile = __DIR__ . '/input/dataset.csv';

$fs = fstab()->for('file');

// A non-null status means a schema file was already written by a previous run.
$schemaAlreadySaved = $fs->status(path($schemaFile)) !== null;

if ($schemaAlreadySaved) {
    // Reuse the stored schema instead of detecting it again.
    $schemaJson = $fs->readFrom(path($schemaFile))->content();
    $schema = schema_from_json($schemaJson);
} else {
    // Detect the schema from the first 100 rows only: a smaller sample is faster
    // to process, but the detected types might be less accurate.
    $schema = data_frame()
        ->read(from_csv($datasetFile))
        ->limit(100)
        ->autoCast()
        ->schema();

    // Store the detected schema as JSON so that later runs can take the branch above.
    $fs->writeTo(path($schemaFile))
        ->append(schema_to_json($schema))
        ->close();
}

// Passing a predefined schema when reading a schemaless format such as CSV
// can significantly improve performance.
data_frame()
    ->read(from_csv($datasetFile, schema: $schema))
    ->collect()
    ->write(to_output(truncate: false, output: Output::rows_and_schema))
    ->run();
Contributors

Built in the open.

Join us on GitHub
scroll back to top