I am uploading a gzip file to an S3 bucket from a Java application; the data in the file will be queried with Athena. The gzip file is getting corrupted during the upload: Athena cannot read any data from it, and when the file is downloaded and manually unzipped, the tool reports "not a gzip file".
private void getAndProcessFilesGenReports(String parUrl, String custCode, long size, String queryDate) {
try (CloseableHttpClient httpclient = HttpClientBuilder.create().setDefaultCredentialsProvider(getCredentialsProvider()).build();) {
CloseableHttpResponse response;
HttpGet httpget = new HttpGet(BASE_URI.concat(parUrl));
response = httpclient.execute(httpget);
httpget.setConfig(config);
response.getStatusLine().getStatusCode(), response.getStatusLine().getReasonPhrase());
if (response.getStatusLine().getStatusCode() != 200) {
log.error("getAndProcessFilesGenReports partUrl could not get response for custCode---> {}", custCode);
}
if (response.getStatusLine().getStatusCode() == 200) {
GZIPInputStream gzis = new GZIPInputStream(response.getEntity().getContent());
String bucketName = bucketForDetailedBilling(GEN_REPORT_TYPE, custCode, queryDate);
uploadGzipFileToS3(gzis, size, bucketName);
}
} catch (Exception e) {
log.error("error in getAndProcessFilesGenReports()--->", e);
}
}
private void uploadGzipFileToS3(InputStream gzis, long size, String bucketName) {
log.info("uploadGzipFileToS3 size{} --- bucketName {}--->", size, bucketName);
ClientConfiguration clientConfiguration = new ClientConfiguration();
clientConfiguration.setConnectionMaxIdleMillis(600000);
clientConfiguration.setConnectionTimeout(600000);
clientConfiguration.setClientExecutionTimeout(600000);
clientConfiguration.setUseGzip(true);
clientConfiguration.setConnectionTTL(1000 * 60 * 60);
AmazonS3Client amazonS3Client = new AmazonS3Client(clientConfiguration);
TransferManager transferManager = new TransferManager(amazonS3Client);
try {
ObjectMetadata objectMetadata = new ObjectMetadata();
objectMetadata.setContentLength(size);
transferManager.getConfiguration().setMultipartUploadThreshold(1024 * 5);
PutObjectRequest request = new PutObjectRequest(bucketName, DBR_NAME + DBR_EXT, gzis, objectMetadata);
request.getRequestClientOptions().setReadLimit(1024 * 5 + 1);
request.setSdkClientExecutionTimeout(10000 * 60 * 60);
Upload upload = transferManager.upload(request);
upload.waitForCompletion();
}`