0

I am creating a Data Fusion pipeline to ingest a CSV file from an S3 bucket, apply Wrangler directives, and store the result in a GCS bucket. The input CSV file has 18 columns. However, the output CSV file has only 8 columns. I suspect this could be due to the CSV encoding format, but I am not sure. What could be the reason here?

Pipeline JSON

{
"name": "aws_fusion_v1",
"description": "Data Pipeline Application",
"artifact": {
    "name": "cdap-data-pipeline",
    "version": "6.1.2",
    "scope": "SYSTEM"
},
"config": {
    "resources": {
        "memoryMB": 2048,
        "virtualCores": 1
    },
    "driverResources": {
        "memoryMB": 2048,
        "virtualCores": 1
    },
    "connections": [
        {
            "from": "Amazon S3",
            "to": "Wrangler"
        },
        {
            "from": "Wrangler",
            "to": "GCS2"
        },
        {
            "from": "Argument Setter",
            "to": "Amazon S3"
        }
    ],
    "comments": [],
    "postActions": [],
    "properties": {},
    "processTimingEnabled": true,
    "stageLoggingEnabled": true,
    "stages": [
        {
            "name": "Amazon S3",
            "plugin": {
                "name": "S3",
                "type": "batchsource",
                "label": "Amazon S3",
                "artifact": {
                    "name": "amazon-s3-plugins",
                    "version": "1.11.0",
                    "scope": "SYSTEM"
                },
                "properties": {
                    "format": "text",
                    "authenticationMethod": "Access Credentials",
                    "filenameOnly": "false",
                    "recursive": "false",
                    "ignoreNonExistingFolders": "false",
                    "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"body\",\"type\":\"string\"}]}",
                    "referenceName": "aws_source",
                    "path": "${input.bucket}",
                    "accessID": "${input.access_id}",
                    "accessKey": "${input.access_key}"
                }
            },
            "outputSchema": [
                {
                    "name": "etlSchemaBody",
                    "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"body\",\"type\":\"string\"}]}"
                }
            ],
            "type": "batchsource",
            "label": "Amazon S3",
            "icon": "icon-s3"
        },
        {
            "name": "Wrangler",
            "plugin": {
                "name": "Wrangler",
                "type": "transform",
                "label": "Wrangler",
                "artifact": {
                    "name": "wrangler-transform",
                    "version": "4.1.5",
                    "scope": "SYSTEM"
                },
                "properties": {
                    "field": "*",
                    "precondition": "false",
                    "threshold": "1",
                    "workspaceId": "804a2995-7c06-4ab2-b342-a9a01aa03a3d",
                    "schema": "${output.schema}",
                    "directives": "${directive}"
                }
            },
            "outputSchema": [
                {
                    "name": "etlSchemaBody",
                    "schema": "${output.schema}"
                }
            ],
            "inputSchema": [
                {
                    "name": "Amazon S3",
                    "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"body\",\"type\":\"string\"}]}"
                }
            ],
            "type": "transform",
            "label": "Wrangler",
            "icon": "icon-DataPreparation"
        },
        {
            "name": "GCS2",
            "plugin": {
                "name": "GCS",
                "type": "batchsink",
                "label": "GCS2",
                "artifact": {
                    "name": "google-cloud",
                    "version": "0.14.2",
                    "scope": "SYSTEM"
                },
                "properties": {
                    "project": "auto-detect",
                    "suffix": "yyyy-MM-dd-HH-mm",
                    "format": "csv",
                    "serviceFilePath": "auto-detect",
                    "location": "us",
                    "referenceName": "gcs_sink",
                    "path": "${output.path}",
                    "schema": "${output.schema}"
                }
            },
            "outputSchema": [
                {
                    "name": "etlSchemaBody",
                    "schema": "${output.schema}"
                }
            ],
            "inputSchema": [
                {
                    "name": "Wrangler",
                    "schema": ""
                }
            ],
            "type": "batchsink",
            "label": "GCS2",
            "icon": "fa-plug"
        },
        {
            "name": "Argument Setter",
            "plugin": {
                "name": "ArgumentSetter",
                "type": "action",
                "label": "Argument Setter",
                "artifact": {
                    "name": "argument-setter-plugins",
                    "version": "1.1.1",
                    "scope": "USER"
                },
                "properties": {
                    "method": "GET",
                    "connectTimeout": "60000",
                    "readTimeout": "60000",
                    "numRetries": "0",
                    "followRedirects": "true",
                    "url": "${argfile}"
                }
            },
            "outputSchema": [
                {
                    "name": "etlSchemaBody",
                    "schema": ""
                }
            ],
            "type": "action",
            "label": "Argument Setter",
            "icon": "fa-plug"
        }
    ],
    "schedule": "0 * * * *",
    "engine": "spark",
    "numOfRecordsPreview": 100,
    "description": "Data Pipeline Application",
    "maxConcurrentRuns": 1
}
}

Edit: The missing columns in the output file were due to spaces in the column names. But I am now facing another issue. In Wrangler, when I pass the directive "parse-as-csv :body ',' false", the output file is empty. But when I pass "parse-as-csv :body ',' true", the output file has all the data without the header, as expected.

0 Answers