
My question may be a bit involved, but I can explain further if anyone needs more detail while helping me resolve it.

I'm writing a UDF. I have a dictionary, _readingType = {0: ("a", "b")}, with about 300 keys. There is a variable reading = int(strIn[j + 2:j + 6], 16) (called readingType in the code below); depending on its value, readingTypeMapping = _readingType.get(readingType) grabs the value for that key, which is appended to the result list. When I hard-code a dummy value for the reading, the lookup gives the desired output, but when the value is computed dynamically it does not. And this is my schema:

parse_data_udf = udf(parse_data, ArrayType(StructType([
    StructField("i", StringType(), False),
    StructField("timeStamp", StringType(), False),
    StructField("numEvents", IntegerType(), False),
    StructField("numReadings", IntegerType(), False),
    StructField("readingType", IntegerType(), False),
    StructField("readingTypeMapping", IntegerType(), False),
    StructField("readingValue", StringType(), False)
])))

If I define readingTypeMapping as IntegerType, the output is {"i": "0", "timeStamp": "", "numEvents": 0, "numReadings": 5, "readingType": 63, "readingTypeMapping": 0, "readingValue": "5E"}. If I define readingTypeMapping as StringType, I get a NullPointerException. If I keep readingTypeMapping as StringType and pass a hard-coded test value, say reading = 83, the code runs perfectly and produces the desired output. What should I do?
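As a small sanity check (my assumption: a trimmed-down _readingType standing in for the real ~300-entry dictionary), the lookup returns a Python tuple when the key exists, which is neither an integer nor a string as far as the declared schema field is concerned, and None when the key is missing:

# Assumption: a trimmed-down stand-in for the real ~300-entry dictionary
_readingType = {0: ("a", "b")}

mapping = _readingType.get(0)     # key present -> the tuple ("a", "b")
missing = _readingType.get(63)    # key missing -> None, no KeyError is raised
print(type(mapping), missing)    # <class 'tuple'> None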

This is my code

import datetime

from numpy import binary_repr  # assuming numpy's binary_repr is the one used
from pyspark.sql.functions import col, explode, udf
from pyspark.sql.types import (ArrayType, IntegerType, StringType,
                               StructField, StructType)


def parse_data(strIn):
  result = []
  j = 0
  _readingQuality = {0: ("", "")}
  # Truncated here; the real _readingType holds ~300 entries
  _readingType = {0: ("", "")}

  while j < len(strIn) - 1:
    ts = int(strIn[j:j + 8], 16)
    timeStamp = str(datetime.datetime.utcfromtimestamp(ts))
    numEvents = int(strIn[j + 8], 16)
    numReadings = int(strIn[j + 9], 16)
    j += 10
    for i in range(numReadings):
      binary = binary_repr(int(strIn[j:j + 2], 16), 8)
      timeStampPresent = bool(int(binary[0], 2))
      readingQualitiesPresent = bool(int(binary[1], 2))
      pendingPowerOfTen = int(binary[2:5], 2)
      if pendingPowerOfTen == 7:
        pendingPowerOfTen = 9
      readingsValueSizeInBytes = int(binary[5:], 2) + 1
      readingType = int(strIn[j + 2:j + 6], 16)
      # _readingType.get() returns None when readingType is not a known key,
      # which makes the two subscripts below fail for unmapped reading types
      readingTypeMapping = _readingType.get(readingType)
      rc = readingTypeMapping[0]
      rd = readingTypeMapping[1]
      if timeStampPresent:
        ts = int(strIn[j + 6:j + 14], 16)
        timeStamp = str(datetime.datetime.utcfromtimestamp(ts))
        j = j + 14
      else:
        timeStamp = ''
        j = j + 6

      # If reading qualities are present
      if readingQualitiesPresent:
        q = []
        c = []
        d = []
        finished = False
        while not finished:
          binary = binary_repr(int(strIn[j:j + 2], 16), 8)
          num = int(binary[1:], 2)
          q.append(num)
          t = _readingQuality.get(num,(None,None))
          c.append(t[0])
          d.append(t[1])
          if binary[0] == '0':
            finished = True  # No more reading quality codes present, can stop
          else:
            j = j + 2
        j = j + 2

      # 2 characters of string = 1 byte
      readingValue = strIn[j:j + readingsValueSizeInBytes * 2]
      j = j + readingsValueSizeInBytes * 2
          
      result.append({
          "i": str(i),
          "timeStamp": timeStamp,
          "numEvents": numEvents,
          "numReadings": numReadings,
          "readingType": readingType,
          "readingTypeMapping" : readingTypeMapping,
          "rc":rc,
          "rd":rd,
          "readingValue": readingValue
      })
  return result
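
As a side note, one way I can narrow this down is to run parse_data as a plain Python function on the sample payload from the comments, outside of any UDF, so the underlying Python error should show up with an ordinary traceback instead of Spark's wrapped exception. A minimal sketch (assuming the trimmed-down dictionaries above are in scope):

# Debugging sketch: call the parser directly, without registering it as a UDF
sample = "6486A2CC0518003F5E000042001800465E000187200900A109B236587B9480519003F01E20000420035004601E2000187210900A109B164869BC40518003F770000420018004677000187210900A109B3"
print(parse_data(sample))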

Create UDF

parse_data_udf = udf(parse_data, ArrayType(StructType([
    StructField("i", StringType(), False),
    StructField("timeStamp", StringType(), False),
    StructField("numEvents", IntegerType(), False),
    StructField("numReadings", IntegerType(), False),
    StructField("readingType", IntegerType(), False),
    StructField("readingTypeMapping", IntegerType(), False),
    StructField("rc", StringType(), False),
    StructField("rc", StringType(), False),
    StructField("readingValue", StringType(), False)
])))


# Apply the UDF and explode the array of structs into separate rows
df_parsed = df.withColumn("parsed", parse_data_udf(col("PL")))
df_exploded = df_parsed.select("PL", explode(col("parsed")).alias("parsed"))


# Show the resulting DataFrame
display(df_exploded)

PythonException: 'TypeError: 'NoneType' object is not subscriptable', from , line 386. Full traceback below:
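
For reference, this exact message is what Python raises when a .get() lookup misses and the resulting None is then indexed; a minimal reproduction (using the readingType value 63 from the sample output):

# Minimal reproduction of the TypeError in the exception above
mapping = {0: ("a", "b")}.get(63)   # key missing -> None
mapping[0]                          # TypeError: 'NoneType' object is not subscriptable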

  • can you please add some sample data & expected output? – Srinivas Aug 16 '23 at 06:07
  • Sure. Sample data:
    data = [("6486A2CC0518003F5E000042001800465E000187200900A109B236587B9480519003F01E20000420035004601E2000187210900A109B164869BC40518003F770000420018004677000187210900A109B3",)]
    columns = ["PL"]
    df = spark.createDataFrame(data, columns)
    Sample output: {"i": "0", "timeStamp": "", "numEvents": 0, "numReadings": 5, "readingType": 63, "readingTypeMapping": key, "rc": value, "rd": value, "readingValue": "5E"} – Sai Pavan Aug 16 '23 at 13:08

0 Answers