Cache extracted data to avoid repeated calls to slow or rate-limited sources like APIs. This example demonstrates caching HTTP responses from the GitHub API, so subsequent runs use cached data instead of making new requests.
Examples
Data frame
Description
Documentation
Code
<?php
declare(strict_types=1);
use function Flow\ETL\DSL\{config_builder, data_frame, filesystem_cache, from_cache, ref, rename_replace, to_output};
use Flow\ETL\Adapter\Http\DynamicExtractor\NextRequestFactory;
use Flow\ETL\Adapter\Http\PsrHttpClientDynamicExtractor;
use Http\Client\Curl\Client;
use Nyholm\Psr7\Factory\Psr17Factory;
use Psr\Http\Message\{RequestInterface, ResponseInterface};
require __DIR__ . '/vendor/autoload.php';
// The same Psr17Factory instance is passed for both of the cURL client's factory arguments.
$factory = new Psr17Factory();
$client = new Client($factory, $factory);

// Request factory handed to the dynamic extractor: it yields a single GET request
// for the flow-php organisation and returns null for every later call that
// carries a previous response (presumably this tells the extractor to stop —
// confirm against the NextRequestFactory contract).
$singleRequestFactory = new class implements NextRequestFactory {
    /**
     * Build the next request to send, based on the previous response.
     *
     * @param null|ResponseInterface $previousResponse null when no response has been received yet
     *
     * @return null|RequestInterface the GitHub API request on the first call, null otherwise
     */
    public function create(?ResponseInterface $previousResponse = null) : ?RequestInterface
    {
        // A response already exists, so no follow-up request is produced.
        if ($previousResponse !== null) {
            return null;
        }

        $organisationRequest = (new Psr17Factory())
            ->createRequest('GET', 'https://api.github.com/orgs/flow-php');

        // PSR-7 messages are immutable: each withHeader() call returns a new request.
        return $organisationRequest
            ->withHeader('Accept', 'application/vnd.github.v3+json')
            ->withHeader('User-Agent', 'flow-php/etl');
    }
};

$from_github_api = new PsrHttpClientDynamicExtractor($client, $singleRequestFactory);
// Flow configuration whose cache is a filesystem cache located in ./output/cache
// next to this script.
$cacheConfig = config_builder()->cache(filesystem_cache(__DIR__ . '/output/cache'));
$flow = data_frame($cacheConfig);

// Read cache id 'github_api'; the HTTP extractor is supplied as the fallback, so
// (per the example's description) later runs use cached data instead of making
// new requests.
$cachedGithubSource = from_cache(
    id: 'github_api',
    fallback_extractor: $from_github_api
);

$flow
    ->read($cachedGithubSource)
    // Cache the rows under the same id the reader above looks up.
    ->cache('github_api')
    // Decode the JSON response body into a new 'unpacked' entry and keep only that entry.
    ->withEntry('unpacked', ref('response_body')->jsonDecode())
    ->select('unpacked')
    // Unpack the decoded structure; the rename below strips the 'unpacked.' prefix
    // (presumably added to each unpacked entry name — confirm against unpack() docs).
    ->withEntry('unpacked', ref('unpacked')->unpack())
    ->renameEach(rename_replace('unpacked.', ''))
    // Remove the 'unpacked' entry itself, then keep only the organisation fields of interest.
    ->drop('unpacked')
    ->select('name', 'html_url', 'blog', 'login', 'public_repos', 'followers', 'created_at')
    // Print the result to output with truncation disabled.
    ->write(to_output(truncate: false))
    ->run();