I have a JSON data object like the following:

{
    "monitorId": 865,
    "deviceId": "94:54:93:49:96:13",
    "data": "{\"0001\":105.0,\"0002\":1.21,\"0003\":0.69,\"0004\":1.46,\"0005\":47.43,\"0006\":103.3}",
    "state": 2,
    "time": 1593687809180
}

The data field is itself a JSON object serialized as a string. How do I express this schema in Flink's Table API? I have tried creating a UDF that takes the JSON string and outputs the parsed contents, but I can't find a way to populate a DataTypes.ROW object with the result:

import json

from pyflink.table import DataTypes
from pyflink.table.descriptors import Kafka, Json, Schema
from pyflink.table.udf import ScalarFunction, udf

t_env.connect(
    Kafka()
        .version("universal")
        .topic(INPUT_TOPIC)
        .property("bootstrap.servers", PROD_KAFKA)
        .property("zookeeper.connect", PROD_ZOOKEEPER)
        .start_from_latest()
) \
    .with_format(
        Json()
            .json_schema(
                """
                {
                    "type": "object",
                    "properties": {
                        "monitorId": {"type": "string"},
                        "deviceId": {"type": "string"},
                        "data": {"type": "string"},
                        "state": {"type": "integer"},
                        "time": {"type": "string"}
                    }
                }
                """
            )
    ) \
    .with_schema(
        Schema()
            .field("monitorId", DataTypes.STRING())
            .field("deviceId", DataTypes.STRING())
            .field("time", DataTypes.STRING())
            .field("data", DataTypes.STRING())
            .field("state", DataTypes.STRING())
    ) \
    .register_table_source(INPUT_TABLE)

t_env.connect(
    Kafka()
        .version("universal")
        .topic(OUTPUT_TOPIC)
        .property("bootstrap.servers", LOCAL_KAFKA)
        .property("zookeeper.connect", LOCAL_ZOOKEEPER)
        .start_from_latest()
) \
    .with_format(
        Json()
            .json_schema(
                """
                {
                    "type": "object",
                    "properties": {
                        "monitorId": {"type": "string"},
                        "data": {"type": "string"},
                        "time": {"type": "string"}
                    }
                }
                """
            )
    ) \
    .with_schema(
        Schema()
            .field("monitorId", DataTypes.STRING())
            .field("time", DataTypes.STRING())
            .field("data", DataTypes.ROW([DataTypes.FIELD("feature1", DataTypes.STRING())]))
    ) \
    .register_table_sink(OUTPUT_TABLE)



class DataConverter(ScalarFunction):
    def eval(self, str_data):
        data = json.loads(str_data)
        return ?  # <--- how do I populate the DataTypes.ROW with each individual value from data?

t_env.register_function(
    "data_converter",
    udf(DataConverter(),
        input_types=[DataTypes.STRING()],
        result_type=DataTypes.ROW([
            DataTypes.FIELD("feature1", DataTypes.STRING())
        ])))

t_env.from_path(INPUT_TABLE) \
    .select("monitorId, time, data_converter(data)") \
    .insert_into(OUTPUT_TABLE)

t_env.execute("IU pyflink job")


343GuiltySpark
1 Answer

If you want the result type of a Python UDF to be DataTypes.ROW, you can wrap the values in the Python Row class. Row extends from tuple. You can import it with: from pyflink.table.types import Row
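Applied to the UDF from the question, a minimal sketch of what eval could return (mapping the parsed "0001" key to feature1 is purely an illustrative assumption; substitute whichever key you actually need):

import json

from pyflink.table.types import Row
from pyflink.table.udf import ScalarFunction

class DataConverter(ScalarFunction):
    def eval(self, str_data):
        data = json.loads(str_data)
        # Row extends tuple, so values are positional and must appear in
        # the same order as the fields declared in result_type.
        return Row(str(data["0001"]))  # hypothetical value for "feature1"

If you later declare more fields in the ROW result type, return them in the same order, e.g. Row(v1, v2, ...).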

Xingbo Huang