
I've stored a file in Swift (Bluemix Object Storage) and I'm trying to access it from a Python application running on Spark within Bluemix.

I can successfully access the file from a Python notebook running in the same environment, but the same access fails in a submitted Python application with an org.apache.hadoop.fs.swift.exceptions.SwiftConfigurationException: Invalid host name error.

Here's the Python code:

# Added for the submitted job; not needed in the notebook, where sc is predefined

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
sc = SparkContext()
sqlContext = SQLContext(sc)

# The following code was verified to work in the notebook

from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import udf
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.stat import Statistics
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml.feature import VectorAssembler
import sys
import numpy as np
import pandas as pd
import time
import datetime

def set_hadoop_config(credentials):
    """This function sets the Hadoop configuration with given credentials, 
    so it is possible to access data using SparkContext"""

    prefix = "fs.swift.service." + credentials['name']
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set(prefix + ".auth.url", credentials['auth_url']+'/v3/auth/tokens')
    hconf.set(prefix + ".auth.endpoint.prefix", "endpoints")
    hconf.set(prefix + ".tenant", credentials['project_id'])
    hconf.set(prefix + ".username", credentials['user_id'])
    hconf.set(prefix + ".password", credentials['password'])
    hconf.setInt(prefix + ".http.port", 8080)
    hconf.set(prefix + ".region", credentials['region'])
    hconf.setBoolean(prefix + ".public", True)

credentials = {
  'auth_url':'https://identity.open.softlayer.com',
  'project':'object_storage_bcc6ba38_7399_4aed_a47c_e6bcdc959163',
  'project_id':'f26ba12177c44e59adbe243b430b3bf5',
  'region':'dallas',
  'user_id':'bb973e5a84da4fce8c62d95f2e1e5d19',
  'domain_id':'bd9453b2e5e2424388e25677cd26a7cf',
  'domain_name':'1062145',
  'username':'admin_a16bbb9d8d1d051ba505b6e7e76867f61c9d1ac1',
  'password':"""...""",
  'filename':'2001-2008-merged.csv',
  'container':'notebooks',
  'tenantId':'s090-be5845bf9646f1-3ef81b4dcb61'
}
credentials['name'] = 'FlightDelay_demo2'
set_hadoop_config(credentials)
textFile = sc.textFile("swift://" + credentials['container'] + "." + credentials['name'] + credentials['filename'])

textFileRDD=textFile.map(lambda x: x.split(','))

# Line that throws the error:
header = textFileRDD.first()

I'm submitting the Python application to Spark as follows:

./spark-submit.sh \
  --vcap ./vcap.json \
  --deploy-mode cluster \
  --master https://spark.bluemix.net \
  /resources/FlightDelay_demo2.py

Here's my vcap.json:

{
  "credentials": {
    "tenant_id": "s090-be5845bf9646f1-3ef81b4dcb61",
    "tenant_id_full": "2f61d7ef-955a-41f2-9090-be5845bf9646_dd570054-525b-4659-9af1-3ef81b4dcb61",
    "cluster_master_url": "https://spark.bluemix.net",
    "instance_id": "2f61d7ef-955a-41f2-9090-be5845bf9646",
    "tenant_secret": "...",
    "plan": "ibm.SparkService.PayGoPersonal"
  }
}

Here's the full error:

Traceback (most recent call last):
  File "/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/workdir/spark-driver-b2f25278-f8b4-40e3-8d53-9e8a64228197/FlightDelay_demo2.py", line 94, in <module>
    header = textFileRDD.first()
  File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/pyspark.zip/pyspark/rdd.py", line 1315, in first
  File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/pyspark.zip/pyspark/rdd.py", line 1267, in take
  File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/pyspark.zip/pyspark/rdd.py", line 2363, in getNumPartitions
  File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 813, in __call__
  File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/pyspark.zip/pyspark/sql/utils.py", line 45, in deco
  File "/usr/local/src/spark160master/spark-1.6.0-bin-2.6.0/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 308, in get_return_value
py4j.protocol.Py4JJavaError: An error occurred while calling o33.partitions.
: org.apache.hadoop.fs.swift.exceptions.SwiftConfigurationException: Invalid host name
    at org.apache.hadoop.fs.swift.util.SwiftUtils.validSchema(SwiftUtils.java:222)
    at org.apache.hadoop.fs.swift.http.SwiftRestClient.<init>(SwiftRestClient.java:510)
    at org.apache.hadoop.fs.swift.http.SwiftRestClient.getInstance(SwiftRestClient.java:1914)
    at org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystemStore.initialize(SwiftNativeFileSystemStore.java:81)
    at org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem.initialize(SwiftNativeFileSystem.java:129)
    at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2596)
    at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:91)
    at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2630)
    at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2612)
    at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:370)
    at org.apache.hadoop.fs.Path.getFileSystem(Path.java:296)
    at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:256)
    at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228)
    at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
    at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:199)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
    at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:239)
    at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:237)
    at scala.Option.getOrElse(Option.scala:120)
    at org.apache.spark.rdd.RDD.partitions(RDD.scala:237)
    at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:64)
    at org.apache.spark.api.java.AbstractJavaRDDLike.partitions(JavaRDDLike.scala:46)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:95)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:55)
    at java.lang.reflect.Method.invoke(Method.java:507)
    at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
    at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
    at py4j.Gateway.invoke(Gateway.java:259)
    at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
    at py4j.commands.CallCommand.execute(CallCommand.java:79)
    at py4j.GatewayConnection.run(GatewayConnection.java:209)
    at java.lang.Thread.run(Thread.java:785)

I thought it might be relevant, so here's what the Spark configuration (sorted(sc._conf.getAll())) looks like within the Python application:

[(u'spark.app.name', u'/resources/FlightDelay_demo2.py'),
 (u'spark.driver.extraClassPath', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/libs/*:'),
 (u'spark.driver.extraLibraryPath', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/libs/*:'),
 (u'spark.eventLog.dir', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/events'),
 (u'spark.eventLog.enabled', u'true'),
 (u'spark.executor.extraClassPath', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/libs/*:'),
 (u'spark.executor.extraLibraryPath', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/libs/*:'),
 (u'spark.executor.instances', u'2'),
 (u'spark.executor.memory', u'1024m'),
 (u'spark.files', u'/gpfs/fs01/user/s090-be5845bf9646f1-3ef81b4dcb61/data/128249272b758216b946308d5f6ea43ca033a85a/FlightDelay_demo2.py'),
 (u'spark.files.useFetchCache', u'false'),
 (u'spark.master', u'spark://yp-spark-dal09-env5-0020:7083'),
 (u'spark.rdd.compress', u'True'),
 (u'spark.serializer.objectStreamReset', u'100'),
 (u'spark.service.hashed_tenant_id', u'25vT74pNDbLRr98mnGe81k9FmiS9NRiyELS04g=='),
 (u'spark.service.plan_name', u'ibm.SparkService.PayGoPersonal'),
 (u'spark.service.spark_version', u'1.6.0'),
 (u'spark.shuffle.service.port', u'7340'),
 (u'spark.submit.pyFiles', u'null'),
 (u'spark.ui.port', u'0')]

1 Answer

There are two issues, both relating to the swift URL:

  1. credentials['name'] becomes part of the URL's hostname, so it cannot contain characters that are illegal in a hostname, such as underscores. Remove the underscore as follows:

    credentials['name'] = 'FlightDelayDemo2'
    
  2. There's a missing slash between the hostname and the filename. It needs to be added (both fixes are combined in the sketch after this list):

    textFile = sc.textFile("swift://" + credentials['container'] + "." + credentials['name'] + "/" + credentials['filename'])
    

Since I'm answering my own question, I'll also mention another obscure error you might see while debugging something similar, most likely another symptom of a malformed swift:// URL:

pyspark.sql.utils.IllegalArgumentException: u'Host name may not be null'
