0

I have code as below -

import java.io.InputStream
import java.net.URI
import java.util

import com.amazonaws.auth.AWSCredentialsProviderChain
import com.amazonaws.{ClientConfiguration, Protocol}
import com.amazonaws.services.s3.AmazonS3Client
import com.amazonaws.services.s3.model._
import org.apache.hadoop.conf.{Configuration => HadoopConfiguration}
import org.apache.hadoop.fs.{Path => HadoopPath}
import org.apache.hadoop.fs.s3a.{BasicAWSCredentialsProvider, S3AFileSystem}
import com.amazonaws.services.s3.model.ObjectListing

import scala.annotation.tailrec

object FileOperation {

  val uri = new URI("s3a://bucket-name/prefixKey/file.json")
  val fs: S3AFileSystem = new S3AFileSystem()

  def getAWSClient: AmazonS3Client = {
    val conf: HadoopConfiguration = new HadoopConfiguration(true)
    val awsConf: ClientConfiguration = new ClientConfiguration()

    val secureConnections: Boolean = conf.getBoolean("fs.s3a.connection.ssl.enabled", false)
    awsConf.setProtocol(if (secureConnections) Protocol.HTTPS else Protocol.HTTP)

    val accessKey: String = conf.get("fs.s3a.access.key", null.asInstanceOf[String])
    val secretKey: String = conf.get("fs.s3a.secret.key", null.asInstanceOf[String])

    println(s"inside getAWSClient accessKey -> $accessKey ; secretKey -> $secretKey")

    val credentials = new AWSCredentialsProviderChain(new BasicAWSCredentialsProvider(accessKey, secretKey))

    val s3: AmazonS3Client = new AmazonS3Client(credentials, awsConf)
    s3.setEndpoint(conf.get("fs.s3a.endpoint", null.asInstanceOf[String]))
    s3
  }

  def getEntries(recursive: Boolean): Seq[URI] = {

    @tailrec
    def collectEntries(summaries: util.Iterator[S3ObjectSummary], collected: Seq[HadoopPath]): Seq[HadoopPath] = {
      if (summaries.hasNext) {
        val summary: S3ObjectSummary = summaries.next()
        val newPath: String = "s3a://" + summary.getBucketName + "/" + summary.getKey

        collectEntries(summaries, collected :+ {
          fs.initialize(new URI(newPath), new HadoopConfiguration(true))
          new HadoopPath(new URI(newPath))
        })
      } else collected
    }

    val prefixKey: String = if (!(uri.getScheme != null && uri.getPath.isEmpty)) uri.getPath.substring(1) else ""

    val objects: ObjectListing = getAWSClient.listObjects(uri.getHost, prefixKey)

    if (objects.getObjectSummaries.isEmpty) throw new java.nio.file.NoSuchFileException(uri.toString)
    else {
      collectEntries(objects.getObjectSummaries.iterator(), Seq.empty).map(path => path.toUri)
    }

  }

  def asStream: InputStream = {
    val prefixKey1 = if (!(uri.getScheme != null && uri.getPath.isEmpty)) uri.getPath.substring(1) else ""
    val s3object: S3Object = getAWSClient.getObject(new GetObjectRequest(uri.getHost, prefixKey1))
    s3object.getObjectContent
  }

}

the call to function getAWSClient in the function getEntries work but the call in asStream function gets empty access Keys and secret key. getEntries function is used to list files under a folder whereas asStream function returns the input stream of such files which is used to create a BufferedSource and then the content is read.

Below is the hadoop core-ste.xml file. Can someone help me usnderstand why the access and secret keys are appearing null in the asStream function.

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Sample config for Hadoop S3A client. -->
<configuration>

    <property>
        <name>fs.s3a.access.key</name>
        <value>xxxxxxxx</value>
    </property>
    <property>
        <name>fs.s3a.secret.key</name>
        <value>yyyyyyy</value>
    </property>
    <property>
        <name>fs.s3a.connection.ssl.enabled</name>
        <value>false</value>
    </property>
    <property>
        <name>fs.s3a.endpoint</name>
        <value>dev:80</value>
    </property>
    <property>
        <name>fs.s3a.imp</name>
        <value>org.apache.hadoop.fs.s3a.S3A</value>
    </property>
    <property>
        <name>fs.s3a.path.style.access</name>
        <value>true</value>
    </property>
</configuration>
  • In the getAWSClient function, if I hardcode the values of accesskey and secretkey both my function getEntries and asStream works fine. But that is not what I can do in my application that would be put to production – Manoj Kumar Mar 22 '18 at 09:43

1 Answers1

0
  1. the S3A connector doesn't serve up its secrets because they are, well, "secret".
  2. And the S3AFileSystem class doesn't expect initialize() to be called more than once. Expensive things like thread pools and an AWS Transfer manager are created, and only cleaned up in the FileSystem.close() call. Your code is going to leak these. If we'd known people were trying to do that (and now we do!), we'll add a check and fail fast.

If you call FileSystem.listFiles(path, true) you get recursive list of all objects under a path with only one HTTP request made of S3 per thousand descendant entries; this what you seem to be doing with listObjects.

Oh, and if you want the config the FS is using, call fs.getConf(). That will contain the secrets if they were passed in the XML file. If they are saved in JCECKs files or other secure storage, its a bit tricker to pick up.

stevel
  • 12,567
  • 1
  • 39
  • 50