Bulk Data Remote
This guide explains how to download bulk data from the Carbon Arc API and upload it directly to your AWS S3 bucket.
Bulk Data Remote - S3 Example
This example shows how to download bulk data directly to an Amazon S3 bucket using the carbonarc Python package.
To download to Azure instead, use client.data.download_data_to_azure; to download to GCP, use client.data.download_data_to_gcp.
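The example relies on the carbonarc, boto3, and python-dotenv packages. If they are not already installed, they are available from PyPI (the carbonarc distribution name is assumed to match the import below):
pip install carbonarc boto3 python-dotenv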
Import required dependencies
import os
from dotenv import load_dotenv
from carbonarc import CarbonArcClient
# Load environment variables
load_dotenv()
# AWS Python SDK
import boto3
Read in environment variables
API_AUTH_TOKEN = os.getenv("API_AUTH_TOKEN")
S3_BUCKET = os.getenv("S3_BUCKET")
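These variables are expected in the process environment or a local .env file. An illustrative .env with placeholder values (never commit real credentials):
API_AUTH_TOKEN=your-carbon-arc-token
S3_BUCKET=your-bucket-name
AWS_ACCESS_KEY_ID=your-access-key-id
AWS_SECRET_ACCESS_KEY=your-secret-access-key
AWS_REGION=us-east-1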
Create the Carbon Arc client
client = CarbonArcClient(API_AUTH_TOKEN)
Initialize the AWS S3 client
s3_client = boto3.client(
    's3',
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
    aws_session_token=os.environ.get('AWS_SESSION_TOKEN'),  # optional, for temporary credentials
    region_name=os.environ.get('AWS_REGION', 'us-east-1'),  # defaults to us-east-1 if not set
)
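If your environment already provides credentials through the standard AWS credential chain (shared config files, environment variables, instance profiles, or SSO), boto3 can resolve them without explicit keys:
# boto3 falls back to its default credential resolution chain
s3_client = boto3.client('s3', region_name=os.environ.get('AWS_REGION', 'us-east-1'))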
Define the data identifiers to download
DATA_IDENTIFIERS = {
    "CA0031": {
        "outputdir": "output/CA0031",
        "filters": {},
    }
}
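Each entry maps a Carbon Arc data identifier to an S3 key prefix and a filter dictionary, and multiple datasets can be listed at once. In the sketch below, the second identifier and its filter key are hypothetical placeholders; check the Carbon Arc documentation for the filters each dataset actually supports:
DATA_IDENTIFIERS = {
    "CA0031": {"outputdir": "output/CA0031", "filters": {}},
    # Hypothetical second dataset with an illustrative, made-up filter key
    "CA0042": {
        "outputdir": "output/CA0042",
        "filters": {"date_start": "2024-01-01"},  # placeholder, not a documented filter
    },
}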
# Download data for each data identifier
assert isinstance(S3_BUCKET, str), "S3_BUCKET must be set in environment variables"

for data_id, data in DATA_IDENTIFIERS.items():
    params = data["filters"]  # filters defined above; not applied in this minimal example
    outputdir = data["outputdir"]

    # Get the data manifest, which lists every file available for download.
    # You can track the downloaded files to maintain ingestion state.
    manifest = client.data.get_data_manifest(data_id)
    print(f"Data id: {data_id}, total files: {len(manifest['files'])}")

    # Download every file in the manifest; this can be parallelized to speed things up.
    for file in manifest["files"]:
        print(f"Downloading file {file}...")
        print(f"{file['size_bytes'] / 1024 / 1024:.2f} MB")

        # Upload the file directly to S3
        client.data.download_data_to_s3(
            s3_client,
            file["url"],
            s3_bucket=S3_BUCKET,
            s3_key_prefix=outputdir,
        )

    print(f"Downloaded all files for {data_id}")
Notes
- You can track downloaded files separately to manage your ingestion state; the sketch after these notes records completed URLs for that purpose.
- If a manifest is very large, consider parallelizing downloads for performance, as shown in the sketch below.
- Ensure your AWS credentials have permission to upload to the target S3 bucket (for example, s3:PutObject on the relevant prefix).
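The two notes above can be combined in a minimal sketch using only the Python standard library. It assumes the client, s3_client, S3_BUCKET, and manifest variables from the example are in scope, that download_data_to_s3 is safe to call from multiple threads (verify this against the carbonarc SDK before relying on it), and uses a hypothetical local state file so a rerun can skip already-ingested files:
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

STATE_FILE = "ingested.json"  # hypothetical local state file

# Load the set of URLs already ingested on a previous run, if any
try:
    with open(STATE_FILE) as f:
        completed = set(json.load(f))
except FileNotFoundError:
    completed = set()

def upload(file, prefix):
    # Assumes download_data_to_s3 is thread-safe (not confirmed here)
    client.data.download_data_to_s3(
        s3_client,
        file["url"],
        s3_bucket=S3_BUCKET,
        s3_key_prefix=prefix,
    )
    return file["url"]

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = [
        executor.submit(upload, f, "output/CA0031")
        for f in manifest["files"]
        if f["url"] not in completed
    ]
    for future in as_completed(futures):
        completed.add(future.result())

# Persist ingestion state for the next run
with open(STATE_FILE, "w") as f:
    json.dump(sorted(completed), f)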