
Bulk Data Remote

This guide explains how to download bulk data from the Carbon Arc API and upload it directly to your AWS S3 bucket.

Bulk Data Remote - S3 Example

This guide shows how to download bulk data directly to an Amazon S3 bucket using the carbonarc Python package.
To download to Azure, use client.data.download_data_to_azure.
To download to GCP, use client.data.download_data_to_gcp.


Import required dependencies

import os

import boto3  # AWS Python SDK
from dotenv import load_dotenv
from carbonarc import CarbonArcClient

# Load environment variables from a local .env file
load_dotenv()

Read in environment variables

API_AUTH_TOKEN = os.getenv("API_AUTH_TOKEN")
S3_BUCKET = os.getenv("S3_BUCKET")
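
For reference, the .env file read by load_dotenv might look like the following (all values are placeholders):

# Carbon Arc API token and target S3 bucket
API_AUTH_TOKEN=your-carbon-arc-api-token
S3_BUCKET=your-s3-bucket-name

# AWS credentials used by boto3 (AWS_SESSION_TOKEN is only needed for temporary credentials)
AWS_ACCESS_KEY_ID=your-access-key-id
AWS_SECRET_ACCESS_KEY=your-secret-access-key
AWS_SESSION_TOKEN=your-session-token
AWS_REGION=us-east-1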

Create the Carbon Arc client

client = CarbonArcClient(API_AUTH_TOKEN)

Initialize the AWS S3 client

s3_client = boto3.client(
    's3',
    aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY'),
    aws_session_token=os.environ.get('AWS_SESSION_TOKEN'),  # Optional, for temporary credentials
    region_name=os.environ.get('AWS_REGION', 'us-east-1')  # Defaults to us-east-1 if not set
)
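
If your environment already has AWS credentials configured (for example through an IAM role, an instance profile, or the AWS CLI), you can instead let boto3 resolve them through its default credential chain:

# Alternative: rely on boto3's default credential chain instead of explicit keys
s3_client = boto3.client('s3', region_name=os.environ.get('AWS_REGION', 'us-east-1'))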

Define the data identifiers to download

DATA_IDENTIFIERS = {
    "CA0031": {
        "outputdir": "output/CA0031",
        "filters": {},
    }
}
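
Each entry maps a data identifier to an S3 key prefix for its output and an optional set of filters. To download several feeds in one run, add one entry per identifier; for example (the second identifier below is a placeholder, not a real feed):

DATA_IDENTIFIERS = {
    "CA0031": {
        "outputdir": "output/CA0031",
        "filters": {},
    },
    # "CAxxxx" is a placeholder; substitute a data identifier you have access to
    "CAxxxx": {
        "outputdir": "output/CAxxxx",
        "filters": {},
    },
}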
Download the data to S3

# Download data for each data identifier
assert isinstance(S3_BUCKET, str), "S3_BUCKET must be set in environment variables"

for data_id, data in DATA_IDENTIFIERS.items():
    # print(f"Downloading data for {data_id}")
    params = data["filters"]  # Filters for this data identifier (unused in this basic example)
    outputdir = data["outputdir"]

    # Get the data manifest, which lists all files available for download.
    # You can track the downloaded files to maintain ingestion state.
    manifest = client.data.get_data_manifest(data_id)
    print(f"Data id: {data_id}, total files: {len(manifest['files'])}")
    # print(f"Manifest: {manifest}")

    # Download all files in the manifest; this can be parallelized to speed up the process
    for file in manifest["files"]:
        # Log the file being downloaded and its size
        print(f"Downloading file {file}...")
        print(f"{file['size_bytes'] / 1024 / 1024:.2f} MB")

        # Stream the file into the S3 bucket under the configured key prefix
        client.data.download_data_to_s3(
            s3_client,
            file["url"],
            s3_bucket=S3_BUCKET,
            s3_key_prefix=outputdir,
        )

    print(f"Downloaded all files for {data_id}")

Code

View on GitHub

Notes

  • You can track downloaded files separately to manage your ingestion state; see the sketch after these notes.
  • If a manifest is very large, consider parallelizing downloads (as sketched above) to improve throughput.
  • Ensure your AWS credentials have permission to upload to the target S3 bucket.
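
One simple way to track ingestion state is to persist the URLs of files you have already uploaded and skip them on later runs. The sketch below is illustrative; the state file name and JSON format are not part of the Carbon Arc API:

import json

STATE_FILE = "downloaded_files.json"

def load_state():
    # Return the set of file URLs uploaded in previous runs
    try:
        with open(STATE_FILE) as f:
            return set(json.load(f))
    except FileNotFoundError:
        return set()

def save_state(state):
    with open(STATE_FILE, "w") as f:
        json.dump(sorted(state), f)

downloaded = load_state()
for file in manifest["files"]:
    if file["url"] in downloaded:
        continue  # Skip files ingested in a previous run
    client.data.download_data_to_s3(
        s3_client,
        file["url"],
        s3_bucket=S3_BUCKET,
        s3_key_prefix=outputdir,
    )
    downloaded.add(file["url"])
    save_state(downloaded)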