1

I have a JSON file like the following:

{
  "123": [
    {
      "id": "123",
      "info": {
        "op": {
          "m": 1,
          "q": 2
        },
        "li": [
          "a",
          "b"
        ],
        "ad": [
          {
            "m": 1,
            "q": 2,
            "t": "text"
          },
          {
            "m": 1,
            "q": 2,
            "t": "abc"
          }
        ]
      },
      "dt": 1532494800000,
      "et": 1532494800000
    },
    {
      "id": "123",
      "info": {
        "op": {
          "m": 2,
          "q": 1
        },
        "li": [
          "a",
          "b"
        ],
        "ad": [
          {
            "m": 2,
            "q": 1,
            "t": "atext"
          },
          {
            "m": 10,
            "q": 2,
            "t": "abc"
          }
        ]
      },
      "dt": 1532494800000,
      "et": 1532494800000
    }
  ]
}

Since each JSON object starts with a variable key, how do I write a schema for this? For each JSON object in the file, Spark creates a new schema object. Isn't this a performance bottleneck?

The JSON is stored in the file in compact (non-pretty-printed) form, e.g.:

{"123":[{"id":"123","info":{"op":{"m":1,"q":2},"li":["a","b"],"ad":[{"m":1,"q":2,"t":"text"},{"m":1,"q":2,"t":"abc"}]},"dt":1532494800000,"et":1532494800000},{"id":"123","info":{"op":{"m":2,"q":1},"li":["a","b"],"ad":[{"m":2,"q":1,"t":"atext"},{"m":10,"q":2,"t":"abc"}]},"dt":1532494800000,"et":1532494800000}]}

Each line of the file contains one JSON object. This is what I have so far:

/**
 * Loads the JSON source at {@code path} and maps every resulting row to a {@code MyObject}.
 *
 * @param path location of the JSON input, forwarded unchanged to {@code getJsonRdd}
 * @return an RDD containing the result of {@code JsonReader::parseJsonStructure} for each row
 */
public JavaRDD<MyObject> parseRecordFile(String path) {
  // Read the rows and convert them in a single chained expression.
  return getJsonRdd(path).map(JsonReader::parseJsonStructure);
}

  /**
   * Unimplemented placeholder: the body is empty, so calling this method has no effect.
   *
   * <p>NOTE(review): judging by the name, this is presumably where an explicit schema for the
   * JSON input is meant to be built; nothing in the visible code calls or relies on it yet.
   */
  public void jsonSchemaSpark() {
    //Don't know what to put here.
  }

  /**
   * Reads the input at {@code path} with the {@code "json"} format through {@code sparkSession}
   * and exposes the loaded dataset as an RDD of rows.
   *
   * <p>No explicit schema is handed to the reader here.
   *
   * @param path location of the JSON input to load
   * @return the loaded rows as a {@code JavaRDD}
   */
  private JavaRDD<Row> getJsonRdd(String path) {
    return sparkSession.read().format("json").load(path).toJavaRDD();
  }

  /**
   * Logs the given row and returns a freshly constructed {@code MyObject}.
   *
   * <p>This is currently a stub: the row's contents are only written to the log and are not
   * copied into the returned object.
   *
   * @param row the row to log
   * @return a new {@code MyObject} created with its no-argument constructor
   */
  private static MyObject parseJsonStructure(Row row) {
    // Marker lines bracket the row dump so each row is easy to spot in the log output.
    log.info("Row starting");
    log.info("One row {}", row);
    log.info("Row end");
    // The row is not consulted when building the result.
    return new MyObject();
  }

Does one row correspond to one JSON object, i.e. one line of the file?

Pranaya Behera
  • 545
  • 1
  • 9
  • 24

0 Answers0