I referred to this question: How to upload folder on Google Cloud Storage using Python API
I want to create a script that uploads a folder to GCS asynchronously, similar to gsutil rsync, but in Python and with image and video file types excluded.
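To be explicit about the exclusion part: I just want to skip files by extension before uploading, something like the snippet below (the extension list is only an example):

import os

# example extensions to skip; the real list would be longer
EXCLUDED_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.mp4', '.mov', '.avi'}

def should_upload(path: str) -> bool:
    # upload only regular files whose extension is not in the excluded set
    return os.path.isfile(path) and os.path.splitext(path)[1].lower() not in EXCLUDED_EXTENSIONS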
I have a script that uploads a folder to a GCS bucket synchronously:
import glob
import os
import time

from google.cloud import storage
from gcloud.aio.storage import Storage  # not used yet; kept for the planned async version
from oauth2client.service_account import ServiceAccountCredentials

# GCS_CLIENT = storage.Client()  # would use default credentials
credentials_dict = {
    XXXXXX
}
credentials = ServiceAccountCredentials.from_json_keyfile_dict(
    credentials_dict
)
GCS_CLIENT = storage.Client(credentials=credentials, project='XXXXXXXXXXXX')
def upload_from_directory(directory_path: str, dest_bucket_name: str, dest_blob_name: str):
    # collect every path under the directory, including subfolders
    rel_paths = glob.glob(directory_path + '/**', recursive=True)
    bucket = GCS_CLIENT.get_bucket(dest_bucket_name)
    for local_file in rel_paths:
        # keep the path relative to the parent folder as the blob name
        remote_path = f'{dest_blob_name}/{"/".join(local_file.split(os.sep)[1:])}'
        if os.path.isfile(local_file):
            blob = bucket.blob(remote_path)
            blob.upload_from_filename(local_file)
This is the calling code:
s = time.time()
upload_from_directory("C:/Users/New folder", "XXXXXXXXXX", "New folder")
print(time.time() - s)
This works for me, but it is slow: I have 5 TB of data and want the uploads to run in parallel. I tried using joblib to parallelize the function, but it gave me this error:
PicklingError: Could not pickle the task to send it to the workers.
This is my Joblib code:
import glob
import os
import time

from google.cloud import storage
from gcloud.aio.storage import Storage  # not used yet; kept for the planned async version
from oauth2client.service_account import ServiceAccountCredentials

# GCS_CLIENT = storage.Client()  # would use default credentials
credentials_dict = {
    XXXXXXXX
}
credentials = ServiceAccountCredentials.from_json_keyfile_dict(
    credentials_dict
)
GCS_CLIENT = storage.Client(credentials=credentials, project='XXXXXXXXXX')
def upload_from_directory(local_file):
    dest_bucket_name = "XXXXXXXXXX"
    dest_blob_name = "New folder"
    bucket = GCS_CLIENT.get_bucket(dest_bucket_name)
    remote_path = f'{dest_blob_name}/{"/".join(local_file.split(os.sep)[1:])}'
    if os.path.isfile(local_file):
        blob = bucket.blob(remote_path)
        blob.upload_from_filename(local_file)
Below is the joblib call that runs it in parallel:
from joblib import Parallel, delayed

s = time.time()
directory_path = "C:/Users/New folder"
rel_paths = glob.glob(directory_path + '/**', recursive=True)
Parallel(n_jobs=8)(delayed(upload_from_directory)(local_file) for local_file in rel_paths)
print(time.time() - s)
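My guess is that the module-level GCS_CLIENT (which holds an authenticated session) is what joblib's process workers fail to pickle. A variant I am considering, but have not verified, is to keep joblib and switch it to threads so nothing has to be pickled, creating one client lazily per thread. This is an untested sketch; the helper names and n_jobs value are just placeholders:

import glob
import os
import threading
import time

from google.cloud import storage
from joblib import Parallel, delayed
from oauth2client.service_account import ServiceAccountCredentials

credentials_dict = {
    XXXXXXXX
}

# one client per thread, created lazily, so no client object has to be pickled
_thread_local = threading.local()

def get_client():
    if not hasattr(_thread_local, 'client'):
        credentials = ServiceAccountCredentials.from_json_keyfile_dict(credentials_dict)
        _thread_local.client = storage.Client(credentials=credentials, project='XXXXXXXXXX')
    return _thread_local.client

def upload_one(local_file):
    dest_bucket_name = "XXXXXXXXXX"
    dest_blob_name = "New folder"
    if not os.path.isfile(local_file):
        return
    remote_path = f'{dest_blob_name}/{"/".join(local_file.split(os.sep)[1:])}'
    bucket = get_client().bucket(dest_bucket_name)
    bucket.blob(remote_path).upload_from_filename(local_file)

s = time.time()
rel_paths = glob.glob("C:/Users/New folder" + '/**', recursive=True)
# prefer="threads" keeps everything in one process, so no pickling is involved
Parallel(n_jobs=8, prefer="threads")(delayed(upload_one)(f) for f in rel_paths)
print(time.time() - s)

Is this the right direction, or is there a better way to get gsutil rsync-like parallel (or async) uploads from Python?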